Genotype-based prediction is more accurate within European populations due to European-based GWAS being larger in sample size, and the relatively low heteroegeniety/admixture within European populations compared to some non-European populations. Apart from phenotypic heterogenity across populations, a key reason why European GWAS do not predict well in non-European populations is due to differences in LD and MAF. Although the underlying causal variant maybe the same across populations, the variant best tagging the causal variant in one population may not wel tag the causal variant in another population.
Prediction can be improved in non-European populations by:
Even within super populations, differnces in ancestry will influence the predictive utility of polygenic scores. Accounting for ancestry within the PRS model may improve prediction. Fitting main effects of PCs is one approach, which has been shown to improve prediction over PRS alone. Furthermore, including an interaction term may be beneficial, as the scale of the PRS may vary within a population for non-causal reasons.
Estimate the predictive utility of polygenic scores within each super population. UKB contains individuals within each super population.
Global ancestry is typically defined using reference-projected genotype-based principal components. Often individuals are said to be of a population if they are within N SD of the population mean. Alternatively principal component estimates can be used in a machine learning based approach such as k-means clustering. I have used an elastic net model to predict global ancestry.
Further information can be found here: https://opain.github.io/UKB-GenoPrep/
Show code
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
# Read list of individuals surviving QC in each population
# Read in fam to convert row number to application specific ID
# Row numbers in pheno file cannot be trusted
fam<-fread('/scratch/groups/ukbiobank/ukb18177_glanville/genotyped/ukb18177_glanville_binary_pre_qc.fam')
fam$V1<-seq(1:dim(fam)[1])
fam_2<-fread(paste0(UKBB_output,'/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam'))
fam_merged<-merge(fam, fam_2, by='V2')
keep<-list()
for(pop in c('EUR','AFR','SAS','EAS','AMR')){
keep[[pop]]<-fread(paste0('/scratch/groups/ukbiobank/usr/ollie_pain/ReQC/PostQC/UKB.postQC.',pop,'.keep'))
keep[[pop]]<-merge(keep[[pop]][,1], fam[,1:2], by='V1')
keep[[pop]]$V1<-keep[[pop]]$V2
keep_tmp<-fread(paste0('/scratch/groups/ukbiobank/usr/ollie_pain/ReQC/PostQC/UKB.postQC.',pop,'.keep'))
keep_tmp<-merge(keep_tmp[,1], fam_merged, by.x='V1', by.y='V1.x')
keep_tmp<-keep_tmp[,c('V1.y','V2'),with=F]
# Save keep file with updated IDs
write.table(keep[[pop]], paste0(UKBB_output,'/Phenotype/',pop,'.QC.keep'), col.names=F, row.names=F, quote=F)
write.table(keep_tmp, paste0(UKBB_output,'/Phenotype/',pop,'.QC.UpdateIDs.keep'), col.names=F, row.names=F, quote=F)
}
# Read in phenotype data
pheno<-c('Depression','Intelligence','BMI','Height','T2D','CAD','IBD','MultiScler','RheuArth')
pheno_file<-c('ever_depressed_pheno_final.UpdateIDs.txt','UKBB_Fluid.intelligence.score.UpdateIDs.pheno','UKBB_BMI.score.UpdateIDs.pheno','UKBB_Height.score.UpdateIDs.pheno','t2d_only_111119.UpdateIDs.txt','cad_only_111119.UpdateIDs.txt','UKBB.IBD.txt','UKBB.MultiScler.txt','UKBB.RheuArth.txt')
pheno_list<-list()
for(i in 1:length(pheno)){
pheno_list[[pheno[i]]]<-fread(paste0(UKBB_output,'/Phenotype/',pheno[i],'/',pheno_file[i]))
}
pheno_all<-Reduce(function(...) merge(..., all=T, by=c('FID','IID')), pheno_list)
names(pheno_all)<-c('FID','IID',pheno)
# Count number of non-missing values for phenotype in each population
pheno_dat_per_pop<-NULL
for(pop in c('EUR','AFR','SAS','EAS','AMR')){
pheno_all_pop<-pheno_all[(pheno_all$IID %in% keep[[pop]]$V1),]
for(i in 1:length(pheno)){
if(length(na.omit(unique(pheno_all[[pheno[i]]]))) == 2){
pheno_dat_per_pop<-rbind(pheno_dat_per_pop, data.frame(Pop=pop,
Phenotype=pheno[i],
N=sum(!is.na(pheno_all_pop[[pheno[i]]])),
Ncase=sum(pheno_all_pop[[pheno[i]]] == 1, na.rm=T),
Ncon=sum(pheno_all_pop[[pheno[i]]] == 0, na.rm=T)))
} else {
pheno_dat_per_pop<-rbind(pheno_dat_per_pop, data.frame(Pop=pop,
Phenotype=pheno[i],
N=sum(!is.na(pheno_all_pop[[pheno[i]]])),
Ncase=NA,
Ncon=NA))
}
}
}
# Idenitfy phenotype with at least 1000 indidivuals and 50 cases with data
pheno_dat_per_pop_retain<-pheno_dat_per_pop[which((pheno_dat_per_pop$N >=500 & pheno_dat_per_pop$Ncase >= 50) | (pheno_dat_per_pop$N >=500 & is.na(pheno_dat_per_pop$Ncase))),]
# BMI and Height are the only phenotypes meeting these criteria. This is fine for a preliminary analysis, but other phenotypes that are more available across ancestries should be explored. Intelligence was also available in EUR, AFR, and SAS.
# Save keep file with updated IDs for each phenotype, restricting the sample size to 50K
# Read in fam file with ID to match
system(paste0('mkdir ',UKBB_output,'/DiverseAncestry/Phenotype_subsets/'))
set.seed(1)
for(pop in c('EUR','AFR','SAS','EAS','AMR')){
for(pheno_i in c('BMI','Height')){
pheno_all_pop<-pheno_all[(pheno_all$IID %in% keep[[pop]]$V1),c('FID','IID',pheno_i), with=F]
pheno_all_pop<-pheno_all_pop[complete.cases(pheno_all_pop),]
if(dim(pheno_all_pop)[1] > 50000){
pheno_all_pop<-pheno_all_pop[sample(1:50000),]
}
write.table(pheno_all_pop[,1:2], paste0(UKBB_output,'/DiverseAncestry/Phenotype_subsets/UKB.',pheno_i,'.',pop,'.QC.keep'), col.names=F, row.names=F, quote=F)
}
}
pT + clump: Sparse
# Set required variables
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Create variable listing phenotypes and corresponding GWAS
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
# Calculate polygenic scores using 1KG reference
for i in $(seq 1 2);do
for pop in $(echo AFR AMR EAS EUR SAS);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
sbatch --mem 2G -p brc,shared -J pT_clump /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer/Scaled_polygenic_scorer.R \
--target_plink_chr ${UKBB_output}/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep ${UKBB_output}/DiverseAncestry/Phenotype_subsets/UKB.${pheno_i}.${pop}.QC.keep \
--ref_score ${Geno_1KG_dir}/Score_files_for_polygenic/pt_clump/${gwas_i}/1KGPhase3.w_hm3.${gwas_i} \
--ref_scale ${Geno_1KG_dir}/Score_files_for_polygenic/pt_clump/${gwas_i}/1KGPhase3.w_hm3.${gwas_i}.${pop}.scale \
--ref_freq_chr ${Geno_1KG_dir}/freq_files/${pop}/1KGPhase3.w_hm3.${pop}.chr \
--plink ${plink1_9} \
--pheno_name ${gwas_i} \
--output ${UKBB_output}/DiverseAncestry/1KG_ref/pt_clump/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}
done
done
lassosum
# Set required variables
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Create variable listing phenotypes and corresponding GWAS
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
# Calculate polygenic scores using 1KG reference
for i in $(seq 1 2);do
for pop in $(echo AFR AMR EAS EUR SAS);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
sbatch -n 5 --mem 5G -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_lassosum/Scaled_polygenic_scorer_lassosum.R \
--target_plink_chr ${UKBB_output}/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep ${UKBB_output}/DiverseAncestry/Phenotype_subsets/UKB.${pheno_i}.${pop}.QC.keep \
--ref_score ${Geno_1KG_dir}/Score_files_for_polygenic/lassosum/${gwas_i}/1KGPhase3.w_hm3.${gwas_i} \
--ref_scale ${Geno_1KG_dir}/Score_files_for_polygenic/lassosum/${gwas_i}/1KGPhase3.w_hm3.${gwas_i}.${pop}.scale \
--ref_freq_chr ${Geno_1KG_dir}/freq_files/${pop}/1KGPhase3.w_hm3.${pop}.chr \
--pheno_name ${gwas_i} \
--n_cores 5 \
--plink ${plink1_9} \
--output ${UKBB_output}/DiverseAncestry/1KG_ref/lassosum/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}
done
done
PRScs
# Set required variables
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Create variable listing phenotypes and corresponding GWAS
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
# Calculate polygenic scores using 1KG reference
for i in $(seq 1 2);do
for pop in $(echo AFR AMR EAS EUR SAS);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
sbatch -n 1 --mem 5G -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_PRScs/Scaled_polygenic_scorer_PRScs.R \
--target_plink_chr ${UKBB_output}/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep ${UKBB_output}/DiverseAncestry/Phenotype_subsets/UKB.${pheno_i}.${pop}.QC.keep \
--ref_score ${Geno_1KG_dir}/Score_files_for_polygenic/PRScs/${gwas_i}/1KGPhase3.w_hm3.${gwas_i} \
--ref_scale ${Geno_1KG_dir}/Score_files_for_polygenic/PRScs/${gwas_i}/1KGPhase3.w_hm3.${gwas_i}.${pop}.scale \
--ref_freq_chr ${Geno_1KG_dir}/freq_files/${pop}/1KGPhase3.w_hm3.${pop}.chr \
--pheno_name ${gwas_i} \
--plink ${plink1_9} \
--output ${UKBB_output}/DiverseAncestry/1KG_ref/PRScs/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}
done
done
SBLUP
# Set required variables
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Create variable listing phenotypes and corresponding GWAS
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
# Calculate polygenic scores using 1KG reference
for i in $(seq 1 2);do
for pop in $(echo AFR AMR EAS EUR SAS);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
sbatch -n 1 --mem 5G -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_SBLUP/Scaled_polygenic_scorer_SBLUP.R \
--target_plink_chr ${UKBB_output}/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep ${UKBB_output}/DiverseAncestry/Phenotype_subsets/UKB.${pheno_i}.${pop}.QC.keep \
--ref_score ${Geno_1KG_dir}/Score_files_for_polygenic/SBLUP/${gwas_i}/GWAS_sumstats_SBLUP.sblup.cojo \
--ref_scale ${Geno_1KG_dir}/Score_files_for_polygenic/SBLUP/${gwas_i}/1KGPhase3.w_hm3.${gwas_i}.${pop}.scale \
--ref_freq_chr ${Geno_1KG_dir}/freq_files/${pop}/1KGPhase3.w_hm3.${pop}.chr \
--pheno_name ${gwas_i} \
--plink ${plink1_9} \
--output ${UKBB_output}/DiverseAncestry/1KG_ref/SBLUP/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}
done
done
SBayesR
# Set required variables
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Create variable listing phenotypes and corresponding GWAS
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
# Calculate polygenic scores using 1KG reference
for i in $(seq 1 2);do
for pop in $(echo AFR AMR EAS EUR SAS);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
sbatch -n 1 --mem 5G -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_SBayesR/Scaled_polygenic_scorer_SBayesR.R \
\
--target_plink_chr ${UKBB_output}/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep ${UKBB_output}/DiverseAncestry/Phenotype_subsets/UKB.${pheno_i}.${pop}.QC.keep \
--ref_score ${Geno_1KG_dir}/Score_files_for_polygenic/SBayesR/${gwas_i}_GCTB_203_robust/GWAS_sumstats_SBayesR.GW.snpRes \
--ref_scale ${Geno_1KG_dir}/Score_files_for_polygenic/SBayesR/${gwas_i}_GCTB_203_robust/1KGPhase3.w_hm3.${gwas_i}.${pop}.scale \
--ref_freq_chr ${Geno_1KG_dir}/freq_files/${pop}/1KGPhase3.w_hm3.${pop}.chr \
--pheno_name ${gwas_i} \
--plink ${plink1_9} \
--output ${UKBB_output}/DiverseAncestry/1KG_ref/SBayesR/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}
done
done
LDPred
# Set required variables
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Create variable listing phenotypes and corresponding GWAS
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
# Calculate polygenic scores using 1KG reference
for i in $(seq 1 2);do
for pop in $(echo AFR AMR EAS EUR SAS);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
sbatch -n 1 --mem 5G -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_LDPred/Scaled_polygenic_scorer_LDPred.R \
--target_plink_chr ${UKBB_output}/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep ${UKBB_output}/DiverseAncestry/Phenotype_subsets/UKB.${pheno_i}.${pop}.QC.keep \
--ref_score ${Geno_1KG_dir}/Score_files_for_polygenic/LDPred/${gwas_i}/1KGPhase3.w_hm3.${gwas_i} \
--ref_scale ${Geno_1KG_dir}/Score_files_for_polygenic/LDPred/${gwas_i}/1KGPhase3.w_hm3.${gwas_i}.${pop}.scale \
--ref_freq_chr ${Geno_1KG_dir}/freq_files/${pop}/1KGPhase3.w_hm3.${pop}.chr \
--pheno_name ${gwas_i} \
--plink ${plink1_9} \
--output ${UKBB_output}/DiverseAncestry/1KG_ref/LDPred/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}
done
done
LDPred2
# Set required variables
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Create variable listing phenotypes and corresponding GWAS
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
# Calculate polygenic scores using 1KG reference
for i in $(seq 1 2);do
for pop in $(echo AFR AMR EAS EUR SAS);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
sbatch -n 6 --mem 10G -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_LDPred2/Scaled_polygenic_scorer_LDPred2.R \
--target_plink_chr ${UKBB_output}/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep ${UKBB_output}/DiverseAncestry/Phenotype_subsets/UKB.${pheno_i}.${pop}.QC.keep \
--ref_score ${Geno_1KG_dir}/Score_files_for_polygenic/LDPred2/${gwas_i}/1KGPhase3.w_hm3.${gwas_i} \
--ref_scale ${Geno_1KG_dir}/Score_files_for_polygenic/LDPred2/${gwas_i}/1KGPhase3.w_hm3.${gwas_i}.${pop}.scale \
--ref_freq_chr ${Geno_1KG_dir}/freq_files/${pop}/1KGPhase3.w_hm3.${pop}.chr \
--pheno_name ${gwas_i} \
--n_cores 6 \
--plink ${plink1_9} \
--output ${UKBB_output}/DiverseAncestry/1KG_ref/LDPred2/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}
done
done
DBSLMM
# Set required variables
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Create variable listing phenotypes and corresponding GWAS
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
# Calculate polygenic scores using 1KG reference
for i in $(seq 1 2);do
for pop in $(echo AFR AMR EAS EUR SAS);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
sbatch --mem 10G -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_DBSLMM/Scaled_polygenic_scorer_DBSLMM.R \
--target_plink_chr ${UKBB_output}/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep ${UKBB_output}/DiverseAncestry/Phenotype_subsets/UKB.${pheno_i}.${pop}.QC.keep \
--ref_score ${Geno_1KG_dir}/Score_files_for_polygenic/DBSLMM/${gwas_i}/1KGPhase3.w_hm3.${gwas_i}.dbslmm.GW.txt \
--ref_scale ${Geno_1KG_dir}/Score_files_for_polygenic/DBSLMM/${gwas_i}/1KGPhase3.w_hm3.${gwas_i}.${pop}.scale \
--ref_freq_chr ${Geno_1KG_dir}/freq_files/${pop}/1KGPhase3.w_hm3.${pop}.chr \
--pheno_name ${gwas_i} \
--plink ${plink1_9} \
--output ${UKBB_output}/DiverseAncestry/1KG_ref/DBSLMM/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}
done
done
GeRS
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Create variable listing phenotypes and corresponding GWAS
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
# Calculate polygenic scores using 1KG reference
for pop in $(echo AFR AMR EAS EUR SAS);do
mkdir -p ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}
> ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/todo.txt
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
for weights in $(cat ~/brc_scratch/Data/TWAS_sumstats/FUSION/snp_weight_list.txt);do
if [ ! -f ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/${weights}/UKBB.w_hm3.EUR.${weights}.${gwas_i}.fiprofile ]; then
echo $gwas_i $pheno_i $weights >> ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/todo.txt
fi
done
done
cat <<EOF >${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/todo_array.sh
#!/bin/bash
#SBATCH -p shared,brc
#SBATCH --mem 10G
#SBATCH -n 1
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
gwas=\$(awk -v var=\${SLURM_ARRAY_TASK_ID} 'NR == var {print \$1}' ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/todo.txt)
pheno=\$(awk -v var=\${SLURM_ARRAY_TASK_ID} 'NR == var {print \$2}' ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/todo.txt)
weights=\$(awk -v var=\${SLURM_ARRAY_TASK_ID} 'NR == var {print \$3}' ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/todo.txt)
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/scaled_functionally_informed_risk_scorer/scaled_functionally_informed_risk_scorer.R \
--targ_feature_pred ${UKBB_output}/Predicted_expression/FUSION/${pop}/\${weights}/UKBB.w_hm3.QCd.AllSNP.FUSION.\${weights}.predictions.gz \
--target_keep ${UKBB_output}/DiverseAncestry/Phenotype_subsets/UKB.\${pheno}.${pop}.QC.keep \
--ref_score ${Geno_1KG_dir}/Score_files_for_functionally_informed_risk_scores/\${gwas}/1KGPhase3.w_hm3.EUR.FUSION.\${gwas}.\${weights}.score \
--ref_scale ${Geno_1KG_dir}/Score_files_for_functionally_informed_risk_scores/\${gwas}/1KGPhase3.w_hm3.EUR.FUSION.\${gwas}.\${weights}.scale \
--pheno_name \${gwas} \
--n_cores 1 \
--pigz ${pigz_binary} \
--output ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/\${weights}/UKBB.w_hm3.EUR.\${weights}.\${gwas}
EOF
done
for pop in $(echo AFR AMR EAS EUR SAS);do
sbatch --array=1-$(wc -l ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/todo.txt | cut -d ' ' -f 1)%10 ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/todo_array.sh
done
GeRS (coloc)
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Create variable listing phenotypes and corresponding GWAS
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
# Calculate polygenic scores using 1KG reference
for pop in $(echo AFR AMR EAS EUR SAS);do
mkdir -p ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}
> ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/todo.txt
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
for weights in $(cat ~/brc_scratch/Data/TWAS_sumstats/FUSION/snp_weight_list.txt);do
if [ ! -f ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/${weights}/UKBB.w_hm3.EUR.${weights}.${gwas_i}.fiprofile ]; then
echo $gwas_i $pheno_i $weights >> ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/todo.txt
fi
done
done
cat <<EOF >${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/todo_array.sh
#!/bin/bash
#SBATCH -p shared,brc
#SBATCH --mem 10G
#SBATCH -n 1
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
gwas=\$(awk -v var=\${SLURM_ARRAY_TASK_ID} 'NR == var {print \$1}' ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/todo.txt)
pheno=\$(awk -v var=\${SLURM_ARRAY_TASK_ID} 'NR == var {print \$2}' ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/todo.txt)
weights=\$(awk -v var=\${SLURM_ARRAY_TASK_ID} 'NR == var {print \$3}' ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/todo.txt)
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/scaled_functionally_informed_risk_scorer/scaled_functionally_informed_risk_scorer.R \
--targ_feature_pred ${UKBB_output}/Predicted_expression/FUSION/${pop}/\${weights}/UKBB.w_hm3.QCd.AllSNP.FUSION.\${weights}.predictions.gz \
--target_keep ${UKBB_output}/DiverseAncestry/Phenotype_subsets/UKB.\${pheno}.${pop}.QC.keep \
--ref_score ${Geno_1KG_dir}/Score_files_for_functionally_informed_risk_scores/\${gwas}_COLOC_PP4/1KGPhase3.w_hm3.EUR.FUSION.\${gwas}.\${weights}.score \
--ref_scale ${Geno_1KG_dir}/Score_files_for_functionally_informed_risk_scores/\${gwas}_COLOC_PP4/1KGPhase3.w_hm3.EUR.FUSION.\${gwas}.\${weights}.scale \
--pheno_name \${gwas} \
--n_cores 1 \
--pigz ${pigz_binary} \
--output ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/\${weights}/UKBB.w_hm3.EUR.\${weights}.\${gwas}
EOF
done
for pop in $(echo EUR AFR SAS EAS AMR);do
sbatch --array=1-$(wc -l ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/todo.txt | cut -d ' ' -f 1)%10 ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/todo_array.sh
done
Note. Some GeRS are not being calculated. Investigate why.
Reference project PCs have already been calculated, as described in the ‘Define ancestry…’ section. We just need to update the IDs to match the genetic data.
PCs
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
fam<-fread('/scratch/groups/ukbiobank/ukb18177_glanville/genotyped/ukb18177_glanville_binary_pre_qc.fam')
fam$V1<-seq(1:dim(fam)[1])
fam_2<-fread(paste0(UKBB_output,'/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam'))
fam_merged<-merge(fam, fam_2, by='V2')
for(pop in c('EUR','AFR','SAS','EAS','AMR')){
PCs<-fread(paste0('/scratch/groups/ukbiobank/usr/ollie_pain/ReQC/defining_ancestry/UKBB.Ancestry.',pop,'.eigenvec'))
PCs<-merge(PCs, fam_merged, by.x='FID', by.y='V1.x')
PCs<-PCs[,c('V1.y','V2','PC1','PC2','PC3','PC4','PC5', 'PC6'),with=F]
names(PCs)<-c('FID','IID','PC1','PC2','PC3','PC4','PC5', 'PC6')
write.table(PCs, paste0(UKBB_output,'/DiverseAncestry/1KG_ref/PCs/UKBB.Ancestry.',pop,'.UpdateIDs.eigenvec'), col.names=T, row.names=F, quote=F)
}
pT + clump comparison
##############################
# Evaluating predictive utility of pT + clump PRSs across multiple pTs individually and in combination
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Make required directories
for pheno_i in $(echo BMI Height);do
mkdir -p /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRSs
done
# Create a file listing the predictors files
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
for pop in $(echo AFR AMR EAS EUR SAS);do
cat > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRSs/UKBB.${pop}.w_hm3.${gwas_i}.EUR-PRSs.predictor_groups <<EOF
predictors
${UKBB_output}/DiverseAncestry/1KG_ref/pt_clump/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.profiles
EOF
done
done
# Derive and evaluate models
pheno=$(echo BMI Height)
pheno_file=$(echo UKBB_BMI.score.UpdateIDs.pheno UKBB_Height.score.UpdateIDs.pheno)
gwas=$(echo BODY04 HEIG03)
prev=$(echo NA NA)
# pT + clump (sparse)
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
pheno_file_i=$(echo ${pheno_file} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
prev_i=$(echo ${prev} | cut -f ${i} -d ' ')
for pop in $(echo AFR AMR EAS EUR SAS);do
sbatch --mem 10G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno ${UKBB_output}/Phenotype/${pheno_i}/${pheno_file_i} \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRSs/UKBB.${pop}.w_hm3.${gwas_i}.EUR-PRSs \
--n_core 1 \
--compare_predictors T \
--assoc T \
--outcome_pop_prev ${prev_i} \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRSs/UKBB.${pop}.w_hm3.${gwas_i}.EUR-PRSs.predictor_groups
done
done
Comparison of PRS methods
##############################
# Evaluating predictive utility of pT + clump PRSs across multiple pTs individually and in combination
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Make required directories
for pheno_i in $(echo BMI Height);do
mkdir -p /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRSs
done
# Create a file listing the predictors files
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
for pop in $(echo AFR AMR EAS EUR SAS);do
cat > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRSs/UKBB.${pop}.w_hm3.${gwas_i}.EUR-PRSs.AllMethodComp.predictor_groups <<EOF
predictors group
${UKBB_output}/DiverseAncestry/1KG_ref/pt_clump/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.profiles pt_clump
${UKBB_output}/DiverseAncestry/1KG_ref/lassosum/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.lassosum_profiles lassosum
${UKBB_output}/DiverseAncestry/1KG_ref/PRScs/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.PRScs_profiles PRScs
${UKBB_output}/DiverseAncestry/1KG_ref/SBLUP/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.SBLUP_profiles SBLUP
${UKBB_output}/DiverseAncestry/1KG_ref/SBayesR/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.SBayesR_profiles SBayesR
${UKBB_output}/DiverseAncestry/1KG_ref/LDPred/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.LDPred_profiles LDPred
${UKBB_output}/DiverseAncestry/1KG_ref/LDPred2/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.LDPred_profiles LDPred2
${UKBB_output}/DiverseAncestry/1KG_ref/DBSLMM/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.DBSLMM_profiles DBSLMM
EOF
done
done
# Derive and evaluate models
pheno=$(echo BMI Height)
pheno_file=$(echo UKBB_BMI.score.UpdateIDs.pheno UKBB_Height.score.UpdateIDs.pheno)
gwas=$(echo BODY04 HEIG03)
prev=$(echo NA NA)
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
pheno_file_i=$(echo ${pheno_file} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
prev_i=$(echo ${prev} | cut -f ${i} -d ' ')
for pop in $(echo AFR AMR EAS EUR SAS);do
sbatch --mem 20G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno ${UKBB_output}/Phenotype/${pheno_i}/${pheno_file_i} \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRSs/UKBB.${pop}.w_hm3.${gwas_i}.EUR-PRSs.AllMethodComp \
--n_core 1 \
--compare_predictors T \
--assoc T \
--outcome_pop_prev ${prev_i} \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRSs/UKBB.${pop}.w_hm3.${gwas_i}.EUR-PRSs.AllMethodComp.predictor_groups
done
done
# Note. The nested cross validation model builder is not ideal when there are multiple parameters for each method. Previously we have identifying the best single parameter in the training sample, and then testing it out of sample. With nested cross validation, we are selecting the best pT based on results out of sample. Probably doesn't make a big diffrence, but will lead to slight inflation of R2 of single parameters PRS. I like using the full sample though, so I could add an option select best predictor within each group, rather than nested cross validate single predictors which is quite pointless. Leave as is for now.
GeRS + PRS
##############################
# Evaluating predictive utility of GeRS and PRS individually and in combination
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Make required directories
for pheno_i in $(echo BMI Height);do
mkdir -p /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs
done
# Create a file listing the predictors files
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
weights=$(cat ${TWAS_rep}/snp_weight_list.txt)
for i in $(seq 1 2);do
for pop in $(echo AFR AMR EAS EUR SAS);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
echo "predictors group" > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${gwas_i}.${pop}-GeRSs.${pop}-PRSs.pt_clump.predictor_groups
for weight in ${weights}; do
if [ -f ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/${weight}/UKBB.w_hm3.EUR.${weight}.${gwas_i}.fiprofile ]; then
echo ${UKBB_output}/GeRS_for_diversity/1KG_ref/${pop}/${weight}/UKBB.w_hm3.EUR.${weight}.${gwas_i}.fiprofile GeRS >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${gwas_i}.${pop}-GeRSs.${pop}-PRSs.pt_clump.predictor_groups
fi
done
echo ${UKBB_output}/DiverseAncestry/1KG_ref/pt_clump/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.profiles PRS >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${gwas_i}.${pop}-GeRSs.${pop}-PRSs.pt_clump.predictor_groups
done
done
# Derive and evaluate models
pheno=$(echo BMI Height)
pheno_file=$(echo UKBB_BMI.score.UpdateIDs.pheno UKBB_Height.score.UpdateIDs.pheno)
gwas=$(echo BODY04 HEIG03)
prev=$(echo NA NA)
# 1KG reference
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
pheno_file_i=$(echo ${pheno_file} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
prev_i=$(echo ${prev} | cut -f ${i} -d ' ')
for pop in $(echo AFR AMR EAS EUR SAS);do
sbatch --mem 10G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno ${UKBB_output}/Phenotype/${pheno_i}/${pheno_file_i} \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${gwas_i}.${pop}-GeRSs.${pop}-PRSs.pt_clump \
--n_core 1 \
--compare_predictors F \
--assoc T \
--outcome_pop_prev ${prev_i} \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${gwas_i}.${pop}-GeRSs.${pop}-PRSs.pt_clump.predictor_groups
done
done
GeRS (coloc) + PRS
##############################
# Evaluating predictive utility of GeRS and PRS individually and in combination
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Make required directories
for pheno_i in $(echo BMI Height);do
mkdir -p /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs
done
# Create a file listing the predictors files
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
weights=$(cat ${TWAS_rep}/snp_weight_list.txt)
for i in $(seq 1 2);do
for pop in $(echo EUR EAS AMR SAS AFR);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
echo "predictors group" > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${gwas_i}.${pop}-GeRSs_coloc.${pop}-PRSs.pt_clump.predictor_groups
for weight in ${weights}; do
if [ -f ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/${weight}/UKBB.w_hm3.EUR.${weight}.${gwas_i}.fiprofile ]; then
echo ${UKBB_output}/GeRS_for_diversity/1KG_ref_withCOLOC/${pop}/${weight}/UKBB.w_hm3.EUR.${weight}.${gwas_i}.fiprofile GeRS >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${gwas_i}.${pop}-GeRSs_coloc.${pop}-PRSs.pt_clump.predictor_groups
fi
done
echo ${UKBB_output}/DiverseAncestry/1KG_ref/pt_clump/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.profiles PRS >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${gwas_i}.${pop}-GeRSs_coloc.${pop}-PRSs.pt_clump.predictor_groups
done
done
# Derive and evaluate models
pheno=$(echo BMI Height)
pheno_file=$(echo UKBB_BMI.score.UpdateIDs.pheno UKBB_Height.score.UpdateIDs.pheno)
gwas=$(echo BODY04 HEIG03)
prev=$(echo NA NA)
# 1KG reference
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
pheno_file_i=$(echo ${pheno_file} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
prev_i=$(echo ${prev} | cut -f ${i} -d ' ')
for pop in $(echo AFR AMR EAS EUR SAS);do
sbatch --mem 10G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno ${UKBB_output}/Phenotype/${pheno_i}/${pheno_file_i} \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${gwas_i}.${pop}-GeRSs_coloc.${pop}-PRSs.pt_clump \
--n_core 1 \
--compare_predictors F \
--assoc T \
--outcome_pop_prev ${prev_i} \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${gwas_i}.${pop}-GeRSs_coloc.${pop}-PRSs.pt_clump.predictor_groups
done
done
PCs
##############################
# Evaluating predictive utility of PCs in combination with PRS
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Make required directories
for pheno_i in $(echo BMI Height);do
mkdir -p /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs
done
# Create a file listing the predictors files
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
for i in $(seq 1 2);do
for pop in $(echo EUR EAS AMR SAS AFR);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
echo "predictors group" > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump.predictor_groups
echo ${UKBB_output}/DiverseAncestry/1KG_ref/PCs/UKBB.Ancestry.${pop}.UpdateIDs.eigenvec PCs >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump.predictor_groups
echo ${UKBB_output}/DiverseAncestry/1KG_ref/pt_clump/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.profiles PRS >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump.predictor_groups
done
done
# Derive and evaluate models
pheno=$(echo BMI Height)
pheno_file=$(echo UKBB_BMI.score.UpdateIDs.pheno UKBB_Height.score.UpdateIDs.pheno)
gwas=$(echo BODY04 HEIG03)
prev=$(echo NA NA)
# 1KG reference
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
pheno_file_i=$(echo ${pheno_file} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
prev_i=$(echo ${prev} | cut -f ${i} -d ' ')
for pop in $(echo AFR AMR EAS EUR SAS);do
sbatch --mem 10G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno ${UKBB_output}/Phenotype/${pheno_i}/${pheno_file_i} \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump \
--n_core 1 \
--compare_predictors F \
--assoc T \
--outcome_pop_prev ${prev_i} \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump.predictor_groups
done
done
PCs with interactions
Rather than modifying the model builder script. Calculate PRS and PC interactions in advance.
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config')
dir.create(paste0(UKBB_output,'/DiverseAncestry/1KG_ref/PC_PRS_int'))
for(gwas in c('BODY04','HEIG03')){
for(pop in c('EUR','AFR','SAS','EAS','AMR')){
# Read in prs and PC data
pcs<-fread(paste0(UKBB_output,'/DiverseAncestry/1KG_ref/PCs/UKBB.Ancestry.',pop,'.UpdateIDs.eigenvec'))
prs<-fread(paste0(UKBB_output,'/DiverseAncestry/1KG_ref/pt_clump/',pop,'/',gwas,'/UKBB.',pop,'.w_hm3.',gwas,'.profiles'))
names(prs)[-1:-2]<-paste0('PRS',1:(dim(prs)[2]-2))
both<-merge(prs, pcs, by=c('FID','IID'))
interaction_list<-NULL
for(i in names(pcs[,-1:-2])){
interaction_list<-rbind(interaction_list, data.frame(x1=i, x2=names(prs[,-1:-2])))
}
interaction_list$x1<-as.character(interaction_list$x1)
interaction_list$x2<-as.character(interaction_list$x2)
int_dat<-NULL
for(i in 1:dim(interaction_list)[1]){
tmp<-as.numeric(scale(both[[interaction_list$x1[i]]]*both[[interaction_list$x2[i]]]))
int_dat<-data.frame(cbind(int_dat, tmp))
names(int_dat)[i]<-paste0(interaction_list$x1[i],'_',interaction_list$x2[i])
}
int_dat<-cbind(both[,1:2],int_dat)
fwrite(int_dat, paste0(UKBB_output,'/DiverseAncestry/1KG_ref/PC_PRS_int/UKBB.',gwas,'.',pop,'.prs_pc_interactions.txt'), quote=F, na='NA', sep=' ')
}
}
##############################
# Evaluating predictive utility of PCs in combination with PRS
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Make required directories
for pheno_i in $(echo BMI Height);do
mkdir -p /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs
done
# Create a file listing the predictors files
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
for i in $(seq 1 2);do
for pop in $(echo EUR EAS AMR SAS AFR);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
echo "predictors group" > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-interactions.predictor_groups
echo ${UKBB_output}/DiverseAncestry/1KG_ref/PCs/UKBB.Ancestry.${pop}.UpdateIDs.eigenvec main >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-interactions.predictor_groups
echo ${UKBB_output}/DiverseAncestry/1KG_ref/pt_clump/${pop}/${gwas_i}/UKBB.${pop}.w_hm3.${gwas_i}.profiles main >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-interactions.predictor_groups
echo ${UKBB_output}/DiverseAncestry/1KG_ref/PC_PRS_int/UKBB.${gwas_i}.${pop}.prs_pc_interactions.txt interaction >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-interactions.predictor_groups
done
done
# Derive and evaluate models
pheno=$(echo BMI Height)
pheno_file=$(echo UKBB_BMI.score.UpdateIDs.pheno UKBB_Height.score.UpdateIDs.pheno)
gwas=$(echo BODY04 HEIG03)
prev=$(echo NA NA)
# 1KG reference
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
pheno_file_i=$(echo ${pheno_file} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
prev_i=$(echo ${prev} | cut -f ${i} -d ' ')
for pop in $(echo AFR AMR EAS EUR SAS);do
sbatch --mem 10G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno ${UKBB_output}/Phenotype/${pheno_i}/${pheno_file_i} \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-interactions \
--n_core 1 \
--compare_predictors F \
--assoc T \
--outcome_pop_prev ${prev_i} \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-interactions.predictor_groups
done
done
I expected interaction terms to improve prediction. They do not, and by including many predicors increase risk of overfitting.
PCs with PRS residuals
Rather than modifying the model builder script. Calculate PRS and PC interactions in advance.
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config')
dir.create(paste0(UKBB_output,'/DiverseAncestry/1KG_ref/PRS_residuals'))
for(gwas in c('BODY04','HEIG03')){
for(pop in c('EUR','AFR','SAS','EAS','AMR')){
# Read in prs and PC data
pcs<-fread(paste0(UKBB_output,'/DiverseAncestry/1KG_ref/PCs/UKBB.Ancestry.',pop,'.UpdateIDs.eigenvec'))
prs<-fread(paste0(UKBB_output,'/DiverseAncestry/1KG_ref/pt_clump/',pop,'/',gwas,'/UKBB.',pop,'.w_hm3.',gwas,'.profiles'))
names(prs)[-1:-2]<-paste0('PRS',1:(dim(prs)[2]-2))
both<-merge(prs, pcs, by=c('FID','IID'))
both_resid<-both[,grepl('FID|IID|PRS', names(both)), with=F]
for(i in names(prs[,-1:-2])){
both_resid[[i]]<-as.numeric(scale(resid(lm(as.formula(paste0(i,'~',paste0(names(pcs[,-1:-2]), collapse=' + '))),data=both))))
}
fwrite(both_resid, paste0(UKBB_output,'/DiverseAncestry/1KG_ref/PRS_residuals/UKBB.',gwas,'.',pop,'.prs_residuals.txt'), quote=F, na='NA', sep=' ')
}
}
##############################
# Evaluating predictive utility of PCs in combination with PRS
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Make required directories
for pheno_i in $(echo BMI Height);do
mkdir -p /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs
done
# Create a file listing the predictors files
pheno=$(echo BMI Height)
gwas=$(echo BODY04 HEIG03)
for i in $(seq 1 2);do
for pop in $(echo EUR EAS AMR SAS AFR);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
echo "predictors group" > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-residuals.predictor_groups
echo ${UKBB_output}/DiverseAncestry/1KG_ref/PCs/UKBB.Ancestry.${pop}.UpdateIDs.eigenvec PCs >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-residuals.predictor_groups
echo ${UKBB_output}/DiverseAncestry/1KG_ref/PRS_residuals/UKBB.${gwas_i}.${pop}.prs_residuals.txt PRS_resid >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-residuals.predictor_groups
done
done
# Derive and evaluate models
pheno=$(echo BMI Height)
pheno_file=$(echo UKBB_BMI.score.UpdateIDs.pheno UKBB_Height.score.UpdateIDs.pheno)
gwas=$(echo BODY04 HEIG03)
prev=$(echo NA NA)
# 1KG reference
for i in $(seq 1 2);do
pheno_i=$(echo ${pheno} | cut -f ${i} -d ' ')
pheno_file_i=$(echo ${pheno_file} | cut -f ${i} -d ' ')
gwas_i=$(echo ${gwas} | cut -f ${i} -d ' ')
prev_i=$(echo ${prev} | cut -f ${i} -d ' ')
for pop in $(echo AFR AMR EAS EUR SAS);do
sbatch --mem 10G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno ${UKBB_output}/Phenotype/${pheno_i}/${pheno_file_i} \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-residuals \
--n_core 1 \
--compare_predictors F \
--assoc T \
--outcome_pop_prev ${prev_i} \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno_i}/Association_withPRS_and_PCs/UKBB.w_hm3.${gwas_i}.${pop}-PCs-PRSs.pt_clump-residuals.predictor_groups
done
done
Residualising PRS in advance makes very little difference to the all model, but residualised PRS generally explain more variance in non-EUR populations. I think as suggested by the Alkes price paper, residualising in advance is a good idea, and also disentangles the population stratification in PRS from PCs.
pT + clump comparison
pop<-c('AFR','AMR','EAS','EUR','SAS')
pheno<-c('BMI','Height')
gwas<-c('BODY04','HEIG03')
library(data.table)
res<-list()
for(i in 1:length(gwas)){
res_pheno<-NULL
for(k in 1:length(pop)){
tmp<-fread(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRSs/UKBB.',pop[k],'.w_hm3.',gwas[i],'.EUR-PRSs.pred_eval.txt'))
tmp2<-data.frame( Population=pop[k],
Phenotype=pheno[i],
tmp)
res_pheno<-rbind(res_pheno, tmp2)
}
res_pheno$Model<-gsub('e.','e-',gsub('_.*','',gsub(paste0(gwas[i],'.'),'',res_pheno$Model)))
res_pheno$Model<-factor(res_pheno$Model, levels=unique(res_pheno$Model))
res[[pheno[i]]]<-res_pheno
}
library(ggplot2)
library(cowplot)
plot_list<-NULL
for(i in 1:length(gwas)){
plot_list[[pheno[i]]]<-ggplot(res[[pheno[i]]], aes(x=factor(Model), y=R, colour=Population)) +
geom_point(stat="identity", position=position_dodge( width=.25)) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(.25)) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 55, vjust = 1, hjust=1), plot.title = element_text(hjust = 0.4, size=12)) +
background_grid(major = 'y', minor = 'y') +
labs(x='pT',y='Predicted-Observed Correlation', title=pheno[i])
}
png('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/UKB.Diverse.pTclump_per_pT.png', units='px', res=300, width=2750, height=1000)
plot_grid(plotlist=plot_list, ncol = 2)
dev.off()
Show pT + clump results
Results are concordant with previous estimates, show the elastic net method improve prediction over individuals pTs, and that the optimal pT varies substantially across populations.
Plot comparison of PRS methods.
Show code
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config')
pheno<-c('Height','BMI')
gwas<-c('HEIG03','BODY04')
pop<-c('AFR','AMR','EAS','EUR','SAS')
##################
# Organise, make tables and figures for per phenotype results
##################
res_eval<-NULL
res_comp<-NULL
res_eval_plots<-list()
res_eval_diff_plots<-list()
res_eval_diff_matrix<-list()
j<-1
for(k in 1:length(pop)){
for(i in 1:length(pheno)){
res_eval_i<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRSs/UKBB.',pop[k],'.w_hm3.',gwas[i],'.EUR-PRSs.AllMethodComp.pred_eval.txt'), header=T, stringsAsFactors=F)
res_comp_i<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRSs/UKBB.',pop[k],'.w_hm3.',gwas[i],'.EUR-PRSs.AllMethodComp.pred_comp.txt'), header=T, stringsAsFactors=F)
# Idenitfy model names for 10FCVal, multi-PRS and pseudoval
selected_models<-NULL
##
# pT+clump (sparse)
##
# Identify best pT in 10FCVal
res_eval_i_per_pT<-res_eval_i[grepl('pt.clump.PredFile', res_eval_i$Model),]
selected_models<-rbind(selected_models,data.frame(Test='pT+clump.10FCVal', Label=
res_eval_i_per_pT[res_eval_i_per_pT$R == max(res_eval_i_per_pT$R),]$Model[1]))
# Identify multi pT
selected_models<-rbind(selected_models,data.frame(Test='pT+clump.MultiPRS', Label='pt.clump'))
##
# lassosum
##
# Identify best pT in 10FCVal
res_eval_i_per_lassosum<-res_eval_i[grepl('lassosum.PredFile', res_eval_i$Model),]
selected_models<-rbind(selected_models,data.frame(Test='lassosum.10FCVal', Label=
res_eval_i_per_lassosum[res_eval_i_per_lassosum$R == max(res_eval_i_per_lassosum$R),]$Model[1]))
# Identify multi pT
selected_models<-rbind(selected_models,data.frame(Test='lassosum.MultiPRS', Label='lassosum'))
# Identify pseudoVal
lassosum_val<-read.table(paste0('/users/k1806347/brc_scratch/Data/1KG/Phase3/Score_files_for_polygenic/lassosum/',gwas[i],'/1KGPhase3.w_hm3.',gwas[i],'.log'), sep='&')
lassosum_val_s<-as.numeric(gsub('s = ','',lassosum_val$V1[grepl('s = ',lassosum_val$V1)]))
lassosum_val_lambda<-as.numeric(gsub('lambda =','',lassosum_val$V1[grepl('lambda =',lassosum_val$V1)]))
lassosum_val_param<-paste0('s',lassosum_val_s,'.lambda',lassosum_val_lambda)
lassosum_val_param<-substr(lassosum_val_param, 1, nchar(lassosum_val_param)-1)
selected_models<-rbind(selected_models,data.frame(Test='lassosum.PseudoVal', Label=res_eval_i[grepl(lassosum_val_param, res_eval_i$Model),]$Model[1]))
##
# PRScs
##
# Identify best pT in 10FCVal
res_eval_i_per_PRScs<-res_eval_i[grepl('PRScs.PredFile', res_eval_i$Model),]
res_eval_i_per_PRScs<-res_eval_i_per_PRScs[!grepl('phiauto', res_eval_i_per_PRScs$Model),]
selected_models<-rbind(selected_models,data.frame(Test='PRScs.10FCVal', Label=res_eval_i_per_PRScs[res_eval_i_per_PRScs$R == max(res_eval_i_per_PRScs$R),]$Model[1]))
# Identify multi pT
selected_models<-rbind(selected_models,data.frame(Test='PRScs.MultiPRS', Label='PRScs'))
# Identify pseudoval
res_eval_i_per_PRScs<-res_eval_i[grepl('PRScs.PredFile', res_eval_i$Model),]
res_eval_i_per_PRScs<-res_eval_i_per_PRScs[grepl('phiauto', res_eval_i_per_PRScs$Model),]
selected_models<-rbind(selected_models,data.frame(Test='PRScs.PseudoVal', Label=res_eval_i_per_PRScs[res_eval_i_per_PRScs$R == max(res_eval_i_per_PRScs$R),]$Model[1]))
##
# SBLUP
##
# Identify PseudoVal
selected_models<-rbind(selected_models,data.frame(Test='SBLUP.Inf', Label='SBLUP'))
##
# SBayesR
##
# Identify PseudoVal
selected_models<-rbind(selected_models,data.frame(Test='SBayesR.PseudoVal', Label='SBayesR'))
##
# LDPred1
##
# Identify best pT in 10FCVal
res_eval_i_per_LDPred<-res_eval_i[grepl('LDPred.PredFile', res_eval_i$Model),]
res_eval_i_per_LDPred<-res_eval_i_per_LDPred[!grepl('.LDpred.inf', res_eval_i_per_LDPred$Model),]
selected_models<-rbind(selected_models,data.frame(Test='LDPred1.10FCVal', Label=res_eval_i_per_LDPred[res_eval_i_per_LDPred$R == max(res_eval_i_per_LDPred$R),]$Model[1]))
# Identify multi pT
selected_models<-rbind(selected_models,data.frame(Test='LDPred1.MultiPRS', Label='LDPred'))
# Identify pseudoval
res_eval_i_per_LDPred<-res_eval_i[grepl('LDPred.PredFile', res_eval_i$Model),]
res_eval_i_per_LDPred<-res_eval_i_per_LDPred[grepl('.LDpred.inf', res_eval_i_per_LDPred$Model),]
selected_models<-rbind(selected_models,data.frame(Test='LDPred1.Inf', Label=res_eval_i_per_LDPred[res_eval_i_per_LDPred$R == max(res_eval_i_per_LDPred$R),]$Model[1]))
##
# LDPred2
##
# Identify best pT in 10FCVal
res_eval_i_per_LDPred2<-res_eval_i[grepl('LDPred2.PredFile', res_eval_i$Model),]
res_eval_i_per_LDPred2<-res_eval_i_per_LDPred2[!grepl('beta.inf|beta.auto', res_eval_i_per_LDPred2$Model),]
selected_models<-rbind(selected_models,data.frame(Test='LDPred2.10FCVal', Label=res_eval_i_per_LDPred2[res_eval_i_per_LDPred2$R == max(res_eval_i_per_LDPred2$R),]$Model[1]))
# Identify multi pT
selected_models<-rbind(selected_models,data.frame(Test='LDPred2.MultiPRS', Label='LDPred2'))
# Identify pseudoval
res_eval_i_per_LDPred2<-res_eval_i[grepl('LDPred2.PredFile', res_eval_i$Model),]
res_eval_i_per_LDPred2<-res_eval_i_per_LDPred2[grepl('beta.inf', res_eval_i_per_LDPred2$Model),]
selected_models<-rbind(selected_models,data.frame(Test='LDPred2.Inf', Label=res_eval_i_per_LDPred2$Model))
res_eval_i_per_LDPred2<-res_eval_i[grepl('LDPred2.PredFile', res_eval_i$Model),]
res_eval_i_per_LDPred2<-res_eval_i_per_LDPred2[grepl('beta.auto', res_eval_i_per_LDPred2$Model),]
selected_models<-rbind(selected_models,data.frame(Test='LDPred2.PseudoVal', Label=res_eval_i_per_LDPred2$Model))
##
# DBSLMM
##
# Identify PseudoVal
selected_models<-rbind(selected_models,data.frame(Test='DBSLMM.PseudoVal', Label='DBSLMM'))
##
# All methods
##
selected_models<-rbind(selected_models,data.frame(Test='All.MultiPRS', Label='All'))
###
# Subset selected models and format
###
# Eval
selected_models$Label_2<-paste0(gsub('_group','',selected_models$Label),'_group')
res_eval_i_select<-res_eval_i[(res_eval_i$Model %in% selected_models$Label_2),]
res_eval_i_select<-merge(res_eval_i_select, selected_models[c('Test','Label_2')], by.x='Model',by.y='Label_2')
res_eval_i_select$Phenotype<-pheno[i]
res_eval_i_select<-res_eval_i_select[c('Phenotype','Test',"R","SE",'P','R2o','N')]
res_eval_i_select$Population<-pop[k]
res_eval<-rbind(res_eval,res_eval_i_select)
# Comp
selected_models$Label_3<-gsub('_group','',selected_models$Label)
res_comp_i_select<-res_comp_i[(res_comp_i$Model_1 %in% selected_models$Label_3) & (res_comp_i$Model_2 %in% selected_models$Label_3),]
res_comp_i_select<-merge(res_comp_i_select, selected_models[c('Test','Label_3')], by.x='Model_1',by.y='Label_3')
names(res_comp_i_select)[names(res_comp_i_select) == 'Test']<-'Model_1_Test'
res_comp_i_select<-merge(res_comp_i_select, selected_models[c('Test','Label_3')], by.x='Model_2',by.y='Label_3')
names(res_comp_i_select)[names(res_comp_i_select) == 'Test']<-'Model_2_Test'
res_comp_i_select$Label<-NULL
res_comp_i_select$Phenotype<-pheno[i]
res_comp_i_select<-res_comp_i_select[c('Phenotype','Model_1_Test','Model_2_Test','Model_1_R','Model_2_R','R_diff','R_diff_pval')]
res_comp<-rbind(res_comp,res_comp_i_select)
###
# Create plot showing performance of each method
###
library(ggplot2)
library(cowplot)
res_eval_i_select$Method<-gsub('\\..*','',res_eval_i_select$Test)
res_eval_i_select$Model<-gsub('.*\\.','',res_eval_i_select$Test)
res_eval_i_select$Method<-factor(res_eval_i_select$Method, levels=c('pT+clump','lassosum','PRScs','SBLUP','SBayesR','LDPred1','LDPred2','DBSLMM','All'))
res_eval_i_select$Model<-factor(res_eval_i_select$Model, levels=c('10FCVal','MultiPRS','PseudoVal','Inf'))
res_eval_plots[[j]] <- ggplot( res_eval_i_select, aes(x=Method, y=R, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(width = 0.7, preserve = "single")) +
labs(y="Correlation (SE)", x='', title=paste0(pheno[i],"(",pop[k],")")) +
coord_cartesian(ylim=c(min(res_eval_i_select$R[res_eval_i_select$Phenotype==pheno[i]])-0.02, max(res_eval_i_select$R[res_eval_i_select$Phenotype==pheno[i]])+0.025), clip="on") +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
background_grid(major = 'y', minor = 'y') +
theme(legend.position="top", legend.title = element_blank(), legend.box="vertical", legend.margin=margin()) +
guides(fill=guide_legend(nrow=2))
###
# Create plot showing performance of each method compared to pT+clump
###
res_eval_i_select$R_diff<-res_eval_i_select$R-res_eval_i_select$R[res_eval_i_select$Test == "pT+clump.10FCVal"]
res_eval_diff_plots[[j]] <- ggplot( res_eval_i_select, aes(x=Method, y=R_diff, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
geom_errorbar(aes(ymin=R_diff-SE, ymax=R_diff+SE), width=.2, position=position_dodge(width = 0.7, preserve = "single")) +
labs(y="Correlation (SE)", x='', title=paste0(pheno[i],"(",pop[k],")")) +
coord_cartesian(ylim=c(min(res_eval_i_select$R_diff[res_eval_i_select$Phenotype==pheno[i]])-0.02, max(res_eval_i_select$R_diff[res_eval_i_select$Phenotype==pheno[i]])+0.025), clip="on") +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
background_grid(major = 'y', minor = 'y') +
theme(legend.position="top", legend.title = element_blank(), legend.box="vertical", legend.margin=margin()) +
guides(fill=guide_legend(nrow=2))
###
# Create plot showing matrix of differences with significance
###
library(reshape2)
res_comp_i_select$R_diff[res_comp_i_select$Model_1_Test == res_comp_i_select$Model_2_Test]<-NA
res_comp_i_select$R_diff_catagory<-'NA'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff < -0.002]<-'-0.025 - -0.002'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff < -0.025]<-'-0.08 - -0.025'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff < -0.08]<-'< -0.08'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff > -0.002 & res_comp_i_select$R_diff < 0.002]<-'-0.002 - 0.002'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff > 0.002]<-'0.002 - 0.025'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff > 0.025]<-'0.025 - 0.08'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff > 0.08]<-'> 0.08'
res_comp_i_select$R_diff_catagory<-factor(res_comp_i_select$R_diff_catagory, level=rev(c('< -0.08','-0.08 - -0.025','-0.025 - -0.002','-0.002 - 0.002','0.002 - 0.025','0.025 - 0.08','> 0.08')))
res_comp_i_select$star<-' '
res_comp_i_select$star[res_comp_i_select$R_diff_pval < 0.05]<-'*'
res_comp_i_select$star[res_comp_i_select$R_diff_pval < 1e-3]<-'**'
res_comp_i_select$star[res_comp_i_select$R_diff_pval < 1e-6]<-'***'
library(RColorBrewer)
res_eval_diff_matrix[[j]]<-ggplot(data = res_comp_i_select, aes(Model_2_Test, Model_1_Test, fill=R_diff_catagory))+
geom_tile(color = "white")+
labs(y='Test', x='Comparison', title=paste0(pheno[i],"(",pop[k],")"), fill='R difference') +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1))+
coord_fixed() +
geom_text(data=res_comp_i_select, aes(Model_2_Test, Model_1_Test, label = star), color = "black", size = 4, angle = 0, vjust=0.8) +
scale_fill_brewer(breaks = levels(res_comp_i_select$R_diff_catagory), palette = "RdBu", drop=F, na.value='grey')
j<-j+1
}
}
write.csv(res_eval, paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp.csv'), row.names=F, quote=F)
write.csv(res_comp, paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_diff.csv'), row.names=F, quote=F)
png(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp.png'), units='px', res=300, width=2450, height=1000*ceiling(length(res_eval_plots)/2))
print(plot_grid(plotlist=res_eval_plots, ncol = 2))
dev.off()
png(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_pTDiff.png'), units='px', res=300, width=2450, height=1000*ceiling(length(res_eval_plots)/2))
print(plot_grid(plotlist=res_eval_diff_plots, ncol = 2))
dev.off()
png(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_diff.png'), units='px', res=300, width=2450, height=1750*length(res_eval_diff_matrix))
print(plot_grid(plotlist=res_eval_diff_matrix, ncol = 1))
dev.off()
# With only two phenotypes, meta-analysis is not necessary or helpful.
#####
# Calculate portability
#####
res_port<-NULL
res_port_plots<-list()
for(i in 1:length(pheno)){
for(k in 1:length(pop)){
if(pop[k] != 'EUR'){
res_eval_i_EUR<-res_eval[res_eval$Population == 'EUR' & res_eval$Phenotype == pheno[i],]
res_eval_i_k<-res_eval[res_eval$Population == pop[k] & res_eval$Phenotype == pheno[i],]
res_eval_i_k<-res_eval_i_k[match(res_eval_i_EUR$Test, res_eval_i_k$Test),]
res_eval_i_k$R_port<-res_eval_i_k$R/res_eval_i_EUR$R
res_eval_i_k$R2_port<-res_eval_i_k$R2o/res_eval_i_EUR$R2o
res_eval_i_k$Method<-gsub('\\..*','',res_eval_i_k$Test)
res_eval_i_k$Model<-gsub('.*\\.','',res_eval_i_k$Test)
res_eval_i_k$Method<-factor(res_eval_i_k$Method, levels=c('pT+clump','lassosum','PRScs','SBLUP','SBayesR','LDPred1','LDPred2','DBSLMM','All'))
res_eval_i_k$Model<-factor(res_eval_i_k$Model, levels=c('10FCVal','MultiPRS','PseudoVal','Inf'))
res_port_plots[[paste0(i,'_',k)]]<-ggplot( res_eval_i_k, aes(x=Method, y=R2_port, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
labs(y="Portability (Pairwise)", x='', title=paste0(pheno[i],"(",pop[k],")")) +
ylim(0,1) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
background_grid(major = 'y', minor = 'y') +
theme(legend.position="top", legend.title = element_blank(), legend.box="vertical", legend.margin=margin()) +
guides(fill=guide_legend(nrow=2))
}
}
}
png(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_portability_R2.png'), units='px', res=300, width=2450, height=1000*ceiling(length(res_port_plots)/2))
print(plot_grid(plotlist=res_port_plots, ncol = 2))
dev.off()
###
# Calculate portability compared to pT+clump in EUR
###
res_port<-NULL
res_port_plots<-list()
for(i in 1:length(pheno)){
for(k in 1:length(pop)){
if(pop[k] != 'EUR'){
res_eval_i_EUR<-res_eval[res_eval$Population == 'EUR' & res_eval$Phenotype == pheno[i] & res_eval$Test == 'pT+clump.10FCVal',]
res_eval_i_k<-res_eval[res_eval$Population == pop[k] & res_eval$Phenotype == pheno[i],]
res_eval_i_k$R_port<-res_eval_i_k$R/res_eval_i_EUR$R
res_eval_i_k$R2_port<-res_eval_i_k$R2o/res_eval_i_EUR$R2o
res_eval_i_k$Method<-gsub('\\..*','',res_eval_i_k$Test)
res_eval_i_k$Model<-gsub('.*\\.','',res_eval_i_k$Test)
res_eval_i_k$Method<-factor(res_eval_i_k$Method, levels=c('pT+clump','lassosum','PRScs','SBLUP','SBayesR','LDPred1','LDPred2','DBSLMM','All'))
res_eval_i_k$Model<-factor(res_eval_i_k$Model, levels=c('10FCVal','MultiPRS','PseudoVal','Inf'))
res_port_plots[[paste0(i,'_',k)]]<-ggplot( res_eval_i_k, aes(x=Method, y=R2_port, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
labs(y="Portability (pT+Clump)", x='', title=paste0(pheno[i],"(",pop[k],")")) +
ylim(0,1.25) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
background_grid(major = 'y', minor = 'y') +
theme(legend.position="top", legend.title = element_blank(), legend.box="vertical", legend.margin=margin()) +
guides(fill=guide_legend(nrow=2))
}
}
}
png(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_portability_R2_ptClump.png'), units='px', res=300, width=2450, height=1000*ceiling(length(res_port_plots)/2))
print(plot_grid(plotlist=res_port_plots, ncol = 2))
dev.off()
Show comparison results
Show code
pop<-c('AFR','AMR','EAS','EUR','SAS')
pheno<-c('BMI','Height')
gwas<-c('BODY04','HEIG03')
library(data.table)
res<-list()
for(i in 1:length(gwas)){
res_pheno<-NULL
for(k in 1:length(pop)){
tmp<-fread(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.',gwas[i],'.',pop[k],'-GeRSs.',pop[k],'-PRSs.pt_clump.pred_eval.txt'))
tmp<-tmp[!grepl('PredFile', tmp$Model),]
tmp2<-data.frame( Population=pop[k],
Phenotype=pheno[i],
tmp)
res_pheno<-rbind(res_pheno, tmp2)
}
res_pheno$Model<-gsub('_group','',res_pheno$Model)
res_pheno$Model<-factor(res_pheno$Model, levels=unique(res_pheno$Model))
res[[pheno[i]]]<-res_pheno
}
library(ggplot2)
library(cowplot)
plot_list<-NULL
for(i in 1:length(gwas)){
plot_list[[pheno[i]]]<-ggplot(res[[pheno[i]]], aes(x=factor(Model), y=R, colour=Population)) +
geom_point(stat="identity", position=position_dodge( width=.30)) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(.30)) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 55, vjust = 1, hjust=1), plot.title = element_text(hjust = 0.4, size=12)) +
background_grid(major = 'y', minor = 'y') +
labs(x=NULL,y="Correlation (SE)", title=pheno[i])
if(i != length(gwas)){
plot_list[[pheno[i]]]<-plot_list[[pheno[i]]] + theme(legend.position = "none")
}
}
png('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/UKB.Diverse.GeRS_vs_pTclump.png', units='px', res=300, width=2250, height=900)
plot_grid(plotlist=plot_list, nrow = 1, rel_widths = c(1.3/3, 1.7/3))
dev.off()
res_2<-list()
for(i in 1:length(gwas)){
res_2_pheno<-NULL
for(k in 1:length(pop)){
tmp<-fread(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.',gwas[i],'.',pop[k],'-GeRSs.',pop[k],'-PRSs.pt_clump.pred_comp.txt'))
tmp<-tmp[tmp$Model_1 == 'All' & tmp$Model_2 == 'PRS',]
tmp2<-data.frame( Population=pop[k],
Phenotype=pheno[i],
tmp)
res_2_pheno<-rbind(res_2_pheno, tmp2)
}
res_2_pheno$Model_1<-'GeRS + PRS'
res_2_pheno$Model_2<-'PRS only'
res_2[[pheno[i]]]<-res_2_pheno
}
res_2_all<-do.call(rbind, res_2)
write.csv(res_2_all, '/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/UKB.Diverse.GeRS_vs_pTclump.csv', row.names=F, quote=F)
Show PRS vs. GeRS results
[1] 0.3628607 0.1922928 0.9302168 1.0000000 0.5654263 [1] 0.3677218 0.1864043 0.5321507 1.0000000 0.5796121
Population | Phenotype | Model_1 | Model_2 | Model_1_R | Model_2_R | R_diff | R_diff_pval |
---|---|---|---|---|---|---|---|
AFR | BMI | GeRS + PRS | PRS only | 0.166 | 0.164 | 0.002 | 6.81e-01 |
AMR | BMI | GeRS + PRS | PRS only | 0.121 | 0.117 | 0.004 | 8.37e-01 |
EAS | BMI | GeRS + PRS | PRS only | 0.266 | 0.198 | 0.068 | 1.96e-06 |
EUR | BMI | GeRS + PRS | PRS only | 0.276 | 0.271 | 0.005 | 1.02e-07 |
SAS | BMI | GeRS + PRS | PRS only | 0.207 | 0.206 | 0.001 | 6.55e-01 |
AFR | Height | GeRS + PRS | PRS only | 0.142 | 0.148 | -0.006 | 2.72e-01 |
AMR | Height | GeRS + PRS | PRS only | 0.275 | 0.205 | 0.070 | 4.12e-04 |
EAS | Height | GeRS + PRS | PRS only | 0.241 | 0.211 | 0.031 | 1.39e-02 |
EUR | Height | GeRS + PRS | PRS only | 0.328 | 0.322 | 0.006 | 1.47e-12 |
SAS | Height | GeRS + PRS | PRS only | 0.257 | 0.255 | 0.002 | 5.43e-01 |
GeRS significantly improves cross ancestry prediction for BMI in EAS, and Height in EAS and AMR. Relative impovement between 10-20%.
Investigate individual associations to determine what is driving the improved prediction.
Look up best GeRS
pop<-c('AFR','AMR','EAS','EUR','SAS')
pheno<-c('BMI','Height')
gwas<-c('BODY04','HEIG03')
library(data.table)
res<-list()
for(i in 1:length(gwas)){
res_pheno<-NULL
for(k in 1:length(pop)){
tmp<-fread(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.',gwas[i],'.',pop[k],'-GeRSs.',pop[k],'-PRSs.pt_clump.assoc.txt'))
tmp2<-data.frame( Population=pop[k],
Phenotype=pheno[i],
tmp)
res_pheno<-rbind(res_pheno, tmp2)
}
res[[pheno[i]]]<-res_pheno
}
res_bmi_eas<-res[['BMI']][res[['BMI']]$Population == 'EAS',]
res_bmi_eas[order(res_bmi_eas$P),]
# GeRS explain more variance than PRS. PredFile 34 (CMC.BRAIN.RNASEQ) and 23 (METSIM ADIPOSE).
pred_group<-fread('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/BMI/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.BODY04.EAS-GeRSs.EAS-PRSs.pt_clump.predictor_groups')
pred_group[c(23,34),]
# The GeRS correpond to CMC.BRAIN.RNASEQ and METSIM ADIPOSE. These are two of the best powered eqtl dataset but they are also relevent to the phenotype.
res_height_amr<-res[['Height']][res[['Height']]$Population == 'AMR',]
res_height_amr[order(res_height_amr$P),]
# GeRS explain more variance than PRS. PredFile 4, 6, 36
pred_group<-fread('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/Height/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.HEIG03.AMR-GeRSs.AMR-PRSs.pt_clump.predictor_groups')
pred_group[c(4,6,36),]
# The GeRS correpond to Artery_Aorta, Artery_Tibial, Muscle_Skeletal.
Show code
pop<-c('AFR','AMR','EAS','EUR','SAS')
pheno<-c('BMI','Height')
gwas<-c('BODY04','HEIG03')
library(data.table)
res<-list()
for(i in 1:length(gwas)){
res_pheno<-NULL
for(k in 1:length(pop)){
tmp<-fread(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.',gwas[i],'.',pop[k],'-GeRSs_coloc.',pop[k],'-PRSs.pt_clump.pred_eval.txt'))
tmp<-tmp[!grepl('PredFile', tmp$Model),]
tmp2<-data.frame( Population=pop[k],
Phenotype=pheno[i],
tmp)
res_pheno<-rbind(res_pheno, tmp2)
}
res_pheno$Model<-gsub('_group','',res_pheno$Model)
res_pheno$Model<-factor(res_pheno$Model, levels=unique(res_pheno$Model))
res[[pheno[i]]]<-res_pheno
}
library(ggplot2)
library(cowplot)
plot_list<-NULL
for(i in 1:length(gwas)){
plot_list[[pheno[i]]]<-ggplot(res[[pheno[i]]], aes(x=factor(Model), y=R, colour=Population)) +
geom_point(stat="identity", position=position_dodge( width=.30)) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(.30)) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 55, vjust = 1, hjust=1), plot.title = element_text(hjust = 0.4, size=12)) +
background_grid(major = 'y', minor = 'y') +
labs(x=NULL,y="Correlation (SE)", title=pheno[i])
if(i != length(gwas)){
plot_list[[pheno[i]]]<-plot_list[[pheno[i]]] + theme(legend.position = "none")
}
}
png('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/UKB.Diverse.GeRS_coloc_vs_pTclump.png', units='px', res=300, width=2250, height=900)
plot_grid(plotlist=plot_list, nrow = 1, rel_widths = c(1.3/3, 1.7/3))
dev.off()
res_2<-list()
for(i in 1:length(gwas)){
res_2_pheno<-NULL
for(k in 1:length(pop)){
tmp<-fread(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.',gwas[i],'.',pop[k],'-GeRSs_coloc.',pop[k],'-PRSs.pt_clump.pred_comp.txt'))
tmp<-tmp[tmp$Model_1 == 'All' & tmp$Model_2 == 'PRS',]
tmp2<-data.frame( Population=pop[k],
Phenotype=pheno[i],
tmp)
res_2_pheno<-rbind(res_2_pheno, tmp2)
}
res_2_pheno$Model_1<-'GeRS + PRS'
res_2_pheno$Model_2<-'PRS only'
res_2[[pheno[i]]]<-res_2_pheno
}
res_2_all<-do.call(rbind, res_2)
write.csv(res_2_all, '/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/UKB.Diverse.GeRS_coloc_vs_pTclump.csv', row.names=F, quote=F)
Show PRS vs. GeRS (coloc) results
Population | Phenotype | Model_1 | Model_2 | Model_1_R | Model_2_R | R_diff | R_diff_pval |
---|---|---|---|---|---|---|---|
AFR | BMI | GeRS + PRS | PRS only | 0.163 | 0.164 | -0.001 | 5.83e-01 |
AMR | BMI | GeRS + PRS | PRS only | 0.134 | 0.117 | 0.017 | 2.87e-01 |
EAS | BMI | GeRS + PRS | PRS only | 0.259 | 0.198 | 0.061 | 9.79e-05 |
EUR | BMI | GeRS + PRS | PRS only | 0.272 | 0.271 | 0.001 | 4.42e-03 |
SAS | BMI | GeRS + PRS | PRS only | 0.204 | 0.206 | -0.002 | 4.04e-01 |
AFR | Height | GeRS + PRS | PRS only | 0.146 | 0.148 | -0.002 | 6.98e-01 |
AMR | Height | GeRS + PRS | PRS only | 0.261 | 0.205 | 0.056 | 4.83e-03 |
EAS | Height | GeRS + PRS | PRS only | 0.223 | 0.211 | 0.012 | 3.13e-01 |
EUR | Height | GeRS + PRS | PRS only | 0.329 | 0.322 | 0.007 | 2.54e-09 |
SAS | Height | GeRS + PRS | PRS only | 0.257 | 0.255 | 0.002 | 3.53e-01 |
Colocalisation does not improve portability of GeRS across populations.
Plot comparison results
library(ggplot2)
library(cowplot)
pheno<-c('BMI','Height')
gwas<-c('BODY04','HEIG03')
pred_eval_pop<-NULL
pred_comp_pop<-NULL
for(pop in c('AFR','AMR','EAS','EUR','SAS')){
for(i in 1:length(gwas)){
pred_eval<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_PCs/UKBB.w_hm3.',gwas[i],'.',pop,'-PCs-PRSs.pt_clump.pred_eval.txt'), header=T, stringsAsFactors=F)
pred_eval$Phenotype<-pheno[i]
pred_eval$Population<-pop
pred_eval_pop<-rbind(pred_eval_pop, pred_eval)
pred_comp<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_PCs/UKBB.w_hm3.',gwas[i],'.',pop,'-PCs-PRSs.pt_clump.pred_comp.txt'), header=T, stringsAsFactors=F)
pred_comp$Phenotype<-pheno[i]
pred_comp$Population<-pop
pred_comp_pop<-rbind(pred_comp_pop, pred_comp)
}
}
pred_eval_pop$Model<-gsub('_group','', pred_eval_pop$Model)
pred_eval_pop$Model<-factor(pred_eval_pop$Model, levels = c('PCs',"PRS",'All'))
png('/scratch/users/k1806347/Analyses/DiverseAncestry/PC_vs_PRS.png', units='px', res=300, width=2000, height=1000)
ggplot(pred_eval_pop, aes(x=Phenotype, y=R, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(), width = 0.7) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(width = 0.7)) +
labs(y="Correlation (SE)", x='') +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_blank()) +
background_grid(major = 'y', minor = 'y') +
facet_grid(. ~ Population)
dev.off()
write.csv(pred_comp_pop, '/scratch/users/k1806347/Analyses/DiverseAncestry/PC_vs_PRS.csv', quote=F, row.names=F)
library(ggplot2)
library(cowplot)
pheno<-c('BMI','Height')
gwas<-c('BODY04','HEIG03')
pred_eval_pop<-NULL
pred_comp_pop<-NULL
for(pop in c('AFR','AMR','EAS','EUR','SAS')){
for(i in 1:length(gwas)){
pred_eval<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_PCs/UKBB.w_hm3.',gwas[i],'.',pop,'-PCs-PRSs.pt_clump-residuals.pred_eval.txt'), header=T, stringsAsFactors=F)
pred_eval$Phenotype<-pheno[i]
pred_eval$Population<-pop
pred_eval_pop<-rbind(pred_eval_pop, pred_eval)
pred_comp<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_PCs/UKBB.w_hm3.',gwas[i],'.',pop,'-PCs-PRSs.pt_clump-residuals.pred_comp.txt'), header=T, stringsAsFactors=F)
pred_comp$Phenotype<-pheno[i]
pred_comp$Population<-pop
pred_comp_pop<-rbind(pred_comp_pop, pred_comp)
}
}
pred_eval_pop$Model<-gsub('_group','', pred_eval_pop$Model)
pred_eval_pop$Model<-gsub('PRS.resid',"PRS (resid)", pred_eval_pop$Model)
pred_eval_pop$Model<-factor(pred_eval_pop$Model, levels = c('PCs',"PRS (resid)",'All'))
png('/scratch/users/k1806347/Analyses/DiverseAncestry/PC_vs_PRS_residuals.png', units='px', res=300, width=2000, height=1000)
ggplot(pred_eval_pop, aes(x=Phenotype, y=R, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(), width = 0.7) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(width = 0.7)) +
labs(y="Correlation (SE)", x='') +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_blank()) +
background_grid(major = 'y', minor = 'y') +
facet_grid(. ~ Population)
dev.off()
write.csv(pred_comp_pop, '/scratch/users/k1806347/Analyses/DiverseAncestry/PC_vs_PRS_residuals.csv', quote=F, row.names=F)
Show PC vs PRS results
Model_1 | Model_2 | Model_1_R | Model_2_R | R_diff | R_diff_pval | Phenotype | Population |
---|---|---|---|---|---|---|---|
PCs | PCs | 0.051 | 0.051 | 0.000 | 1.00e+00 | BMI | AFR |
PCs | PRS | 0.051 | 0.123 | -0.072 | 8.34e-08 | BMI | AFR |
PCs | All | 0.051 | 0.119 | -0.068 | 1.84e-08 | BMI | AFR |
PRS | PCs | 0.123 | 0.051 | 0.072 | 8.34e-08 | BMI | AFR |
PRS | PRS | 0.123 | 0.123 | 0.000 | 1.00e+00 | BMI | AFR |
PRS | All | 0.123 | 0.119 | 0.004 | 1.04e-01 | BMI | AFR |
All | PCs | 0.119 | 0.051 | 0.068 | 1.84e-08 | BMI | AFR |
All | PRS | 0.119 | 0.123 | -0.004 | 1.04e-01 | BMI | AFR |
All | All | 0.119 | 0.119 | 0.000 | 1.00e+00 | BMI | AFR |
PCs | PCs | -0.003 | -0.003 | 0.000 | 1.00e+00 | Height | AFR |
PCs | PRS | -0.003 | 0.120 | -0.122 | 1.32e-12 | Height | AFR |
PCs | All | -0.003 | 0.132 | -0.135 | 2.01e-18 | Height | AFR |
PRS | PCs | 0.120 | -0.003 | 0.122 | 1.32e-12 | Height | AFR |
PRS | PRS | 0.120 | 0.120 | 0.000 | 1.00e+00 | Height | AFR |
PRS | All | 0.120 | 0.132 | -0.012 | 2.23e-02 | Height | AFR |
All | PCs | 0.132 | -0.003 | 0.135 | 2.01e-18 | Height | AFR |
All | PRS | 0.132 | 0.120 | 0.012 | 2.23e-02 | Height | AFR |
All | All | 0.132 | 0.132 | 0.000 | 1.00e+00 | Height | AFR |
PCs | PCs | 0.042 | 0.042 | 0.000 | 1.00e+00 | BMI | AMR |
PCs | PRS | 0.042 | 0.123 | -0.081 | 9.97e-03 | BMI | AMR |
PCs | All | 0.042 | 0.165 | -0.124 | 7.30e-08 | BMI | AMR |
PRS | PCs | 0.123 | 0.042 | 0.081 | 9.97e-03 | BMI | AMR |
PRS | PRS | 0.123 | 0.123 | 0.000 | 1.00e+00 | BMI | AMR |
PRS | All | 0.123 | 0.165 | -0.042 | 1.39e-02 | BMI | AMR |
All | PCs | 0.165 | 0.042 | 0.124 | 7.30e-08 | BMI | AMR |
All | PRS | 0.165 | 0.123 | 0.042 | 1.39e-02 | BMI | AMR |
All | All | 0.165 | 0.165 | 0.000 | 1.00e+00 | BMI | AMR |
PCs | PCs | 0.271 | 0.271 | 0.000 | 1.00e+00 | Height | AMR |
PCs | PRS | 0.271 | 0.207 | 0.065 | 3.57e-02 | Height | AMR |
PCs | All | 0.271 | 0.365 | -0.094 | 8.10e-10 | Height | AMR |
PRS | PCs | 0.207 | 0.271 | -0.065 | 3.57e-02 | Height | AMR |
PRS | PRS | 0.207 | 0.207 | 0.000 | 1.00e+00 | Height | AMR |
PRS | All | 0.207 | 0.365 | -0.159 | 2.21e-15 | Height | AMR |
All | PCs | 0.365 | 0.271 | 0.094 | 8.10e-10 | Height | AMR |
All | PRS | 0.365 | 0.207 | 0.159 | 2.21e-15 | Height | AMR |
All | All | 0.365 | 0.365 | 0.000 | 1.00e+00 | Height | AMR |
PCs | PCs | 0.167 | 0.167 | 0.000 | 1.00e+00 | BMI | EAS |
PCs | PRS | 0.167 | 0.178 | -0.012 | 7.09e-01 | BMI | EAS |
PCs | All | 0.167 | 0.242 | -0.076 | 3.63e-05 | BMI | EAS |
PRS | PCs | 0.178 | 0.167 | 0.012 | 7.09e-01 | BMI | EAS |
PRS | PRS | 0.178 | 0.178 | 0.000 | 1.00e+00 | BMI | EAS |
PRS | All | 0.178 | 0.242 | -0.064 | 7.71e-05 | BMI | EAS |
All | PCs | 0.242 | 0.167 | 0.076 | 3.63e-05 | BMI | EAS |
All | PRS | 0.242 | 0.178 | 0.064 | 7.71e-05 | BMI | EAS |
All | All | 0.242 | 0.242 | 0.000 | 1.00e+00 | BMI | EAS |
PCs | PCs | 0.104 | 0.104 | 0.000 | 1.00e+00 | Height | EAS |
PCs | PRS | 0.104 | 0.187 | -0.083 | 9.98e-03 | Height | EAS |
PCs | All | 0.104 | 0.214 | -0.110 | 1.01e-06 | Height | EAS |
PRS | PCs | 0.187 | 0.104 | 0.083 | 9.98e-03 | Height | EAS |
PRS | PRS | 0.187 | 0.187 | 0.000 | 1.00e+00 | Height | EAS |
PRS | All | 0.187 | 0.214 | -0.027 | 3.65e-02 | Height | EAS |
All | PCs | 0.214 | 0.104 | 0.110 | 1.01e-06 | Height | EAS |
All | PRS | 0.214 | 0.187 | 0.027 | 3.65e-02 | Height | EAS |
All | All | 0.214 | 0.214 | 0.000 | 1.00e+00 | Height | EAS |
PCs | PCs | 0.005 | 0.005 | 0.000 | 1.00e+00 | BMI | EUR |
PCs | PRS | 0.005 | 0.271 | -0.266 | 0.00e+00 | BMI | EUR |
PCs | All | 0.005 | 0.271 | -0.265 | 0.00e+00 | BMI | EUR |
PRS | PCs | 0.271 | 0.005 | 0.266 | 0.00e+00 | BMI | EUR |
PRS | PRS | 0.271 | 0.271 | 0.000 | 1.00e+00 | BMI | EUR |
PRS | All | 0.271 | 0.271 | 0.000 | 3.53e-01 | BMI | EUR |
All | PCs | 0.271 | 0.005 | 0.265 | 0.00e+00 | BMI | EUR |
All | PRS | 0.271 | 0.271 | 0.000 | 3.53e-01 | BMI | EUR |
All | All | 0.271 | 0.271 | 0.000 | 1.00e+00 | BMI | EUR |
PCs | PCs | 0.008 | 0.008 | 0.000 | 1.00e+00 | Height | EUR |
PCs | PRS | 0.008 | 0.322 | -0.314 | 0.00e+00 | Height | EUR |
PCs | All | 0.008 | 0.323 | -0.316 | 0.00e+00 | Height | EUR |
PRS | PCs | 0.322 | 0.008 | 0.314 | 0.00e+00 | Height | EUR |
PRS | PRS | 0.322 | 0.322 | 0.000 | 1.00e+00 | Height | EUR |
PRS | All | 0.322 | 0.323 | -0.002 | 2.83e-04 | Height | EUR |
All | PCs | 0.323 | 0.008 | 0.316 | 0.00e+00 | Height | EUR |
All | PRS | 0.323 | 0.322 | 0.002 | 2.83e-04 | Height | EUR |
All | All | 0.323 | 0.323 | 0.000 | 1.00e+00 | Height | EUR |
PCs | PCs | 0.092 | 0.092 | 0.000 | 1.00e+00 | BMI | SAS |
PCs | PRS | 0.092 | 0.202 | -0.110 | 6.52e-11 | BMI | SAS |
PCs | All | 0.092 | 0.245 | -0.153 | 3.53e-34 | BMI | SAS |
PRS | PCs | 0.202 | 0.092 | 0.110 | 6.52e-11 | BMI | SAS |
PRS | PRS | 0.202 | 0.202 | 0.000 | 1.00e+00 | BMI | SAS |
PRS | All | 0.202 | 0.245 | -0.043 | 2.38e-11 | BMI | SAS |
All | PCs | 0.245 | 0.092 | 0.153 | 3.53e-34 | BMI | SAS |
All | PRS | 0.245 | 0.202 | 0.043 | 2.38e-11 | BMI | SAS |
All | All | 0.245 | 0.245 | 0.000 | 1.00e+00 | BMI | SAS |
PCs | PCs | 0.096 | 0.096 | 0.000 | 1.00e+00 | Height | SAS |
PCs | PRS | 0.096 | 0.230 | -0.135 | 2.59e-34 | Height | SAS |
PCs | All | 0.096 | 0.233 | -0.137 | 4.63e-32 | Height | SAS |
PRS | PCs | 0.230 | 0.096 | 0.135 | 2.59e-34 | Height | SAS |
PRS | PRS | 0.230 | 0.230 | 0.000 | 1.00e+00 | Height | SAS |
PRS | All | 0.230 | 0.233 | -0.003 | 2.92e-01 | Height | SAS |
All | PCs | 0.233 | 0.096 | 0.137 | 4.63e-32 | Height | SAS |
All | PRS | 0.233 | 0.230 | 0.003 | 2.92e-01 | Height | SAS |
All | All | 0.233 | 0.233 | 0.000 | 1.00e+00 | Height | SAS |
Show PC vs PRS residuals results
Model_1 | Model_2 | Model_1_R | Model_2_R | R_diff | R_diff_pval | Phenotype | Population |
---|---|---|---|---|---|---|---|
PCs | PCs | 0.051 | 0.051 | 0.000 | 1.00e+00 | BMI | AFR |
PCs | PRS.resid | 0.051 | 0.103 | -0.052 | 2.68e-03 | BMI | AFR |
PCs | All | 0.051 | 0.120 | -0.069 | 1.35e-08 | BMI | AFR |
PRS.resid | PCs | 0.103 | 0.051 | 0.052 | 2.68e-03 | BMI | AFR |
PRS.resid | PRS.resid | 0.103 | 0.103 | 0.000 | 1.00e+00 | BMI | AFR |
PRS.resid | All | 0.103 | 0.120 | -0.016 | 1.52e-02 | BMI | AFR |
All | PCs | 0.120 | 0.051 | 0.069 | 1.35e-08 | BMI | AFR |
All | PRS.resid | 0.120 | 0.103 | 0.016 | 1.52e-02 | BMI | AFR |
All | All | 0.120 | 0.120 | 0.000 | 1.00e+00 | BMI | AFR |
PCs | PCs | -0.003 | -0.003 | 0.000 | 1.00e+00 | Height | AFR |
PCs | PRS.resid | -0.003 | 0.135 | -0.138 | 1.08e-15 | Height | AFR |
PCs | All | -0.003 | 0.132 | -0.135 | 1.17e-16 | Height | AFR |
PRS.resid | PCs | 0.135 | -0.003 | 0.138 | 1.08e-15 | Height | AFR |
PRS.resid | PRS.resid | 0.135 | 0.135 | 0.000 | 1.00e+00 | Height | AFR |
PRS.resid | All | 0.135 | 0.132 | 0.003 | 1.49e-01 | Height | AFR |
All | PCs | 0.132 | -0.003 | 0.135 | 1.17e-16 | Height | AFR |
All | PRS.resid | 0.132 | 0.135 | -0.003 | 1.49e-01 | Height | AFR |
All | All | 0.132 | 0.132 | 0.000 | 1.00e+00 | Height | AFR |
PCs | PCs | 0.042 | 0.042 | 0.000 | 1.00e+00 | BMI | AMR |
PCs | PRS.resid | 0.042 | 0.167 | -0.126 | 1.42e-04 | BMI | AMR |
PCs | All | 0.042 | 0.162 | -0.121 | 1.61e-06 | BMI | AMR |
PRS.resid | PCs | 0.167 | 0.042 | 0.126 | 1.42e-04 | BMI | AMR |
PRS.resid | PRS.resid | 0.167 | 0.167 | 0.000 | 1.00e+00 | BMI | AMR |
PRS.resid | All | 0.167 | 0.162 | 0.005 | 6.54e-01 | BMI | AMR |
All | PCs | 0.162 | 0.042 | 0.121 | 1.61e-06 | BMI | AMR |
All | PRS.resid | 0.162 | 0.167 | -0.005 | 6.54e-01 | BMI | AMR |
All | All | 0.162 | 0.162 | 0.000 | 1.00e+00 | BMI | AMR |
PCs | PCs | 0.271 | 0.271 | 0.000 | 1.00e+00 | Height | AMR |
PCs | PRS.resid | 0.271 | 0.248 | 0.024 | 4.42e-01 | Height | AMR |
PCs | All | 0.271 | 0.370 | -0.099 | 5.08e-10 | Height | AMR |
PRS.resid | PCs | 0.248 | 0.271 | -0.024 | 4.42e-01 | Height | AMR |
PRS.resid | PRS.resid | 0.248 | 0.248 | 0.000 | 1.00e+00 | Height | AMR |
PRS.resid | All | 0.248 | 0.370 | -0.122 | 2.02e-12 | Height | AMR |
All | PCs | 0.370 | 0.271 | 0.099 | 5.08e-10 | Height | AMR |
All | PRS.resid | 0.370 | 0.248 | 0.122 | 2.02e-12 | Height | AMR |
All | All | 0.370 | 0.370 | 0.000 | 1.00e+00 | Height | AMR |
PCs | PCs | 0.167 | 0.167 | 0.000 | 1.00e+00 | BMI | EAS |
PCs | PRS.resid | 0.167 | 0.171 | -0.004 | 8.91e-01 | BMI | EAS |
PCs | All | 0.167 | 0.243 | -0.076 | 4.15e-05 | BMI | EAS |
PRS.resid | PCs | 0.171 | 0.167 | 0.004 | 8.91e-01 | BMI | EAS |
PRS.resid | PRS.resid | 0.171 | 0.171 | 0.000 | 1.00e+00 | BMI | EAS |
PRS.resid | All | 0.171 | 0.243 | -0.072 | 1.43e-05 | BMI | EAS |
All | PCs | 0.243 | 0.167 | 0.076 | 4.15e-05 | BMI | EAS |
All | PRS.resid | 0.243 | 0.171 | 0.072 | 1.43e-05 | BMI | EAS |
All | All | 0.243 | 0.243 | 0.000 | 1.00e+00 | BMI | EAS |
PCs | PCs | 0.104 | 0.104 | 0.000 | 1.00e+00 | Height | EAS |
PCs | PRS.resid | 0.104 | 0.188 | -0.084 | 9.60e-03 | Height | EAS |
PCs | All | 0.104 | 0.211 | -0.107 | 1.85e-06 | Height | EAS |
PRS.resid | PCs | 0.188 | 0.104 | 0.084 | 9.60e-03 | Height | EAS |
PRS.resid | PRS.resid | 0.188 | 0.188 | 0.000 | 1.00e+00 | Height | EAS |
PRS.resid | All | 0.188 | 0.211 | -0.023 | 6.34e-02 | Height | EAS |
All | PCs | 0.211 | 0.104 | 0.107 | 1.85e-06 | Height | EAS |
All | PRS.resid | 0.211 | 0.188 | 0.023 | 6.34e-02 | Height | EAS |
All | All | 0.211 | 0.211 | 0.000 | 1.00e+00 | Height | EAS |
PCs | PCs | 0.005 | 0.005 | 0.000 | 1.00e+00 | BMI | EUR |
PCs | PRS.resid | 0.005 | 0.271 | -0.266 | 0.00e+00 | BMI | EUR |
PCs | All | 0.005 | 0.271 | -0.266 | 0.00e+00 | BMI | EUR |
PRS.resid | PCs | 0.271 | 0.005 | 0.266 | 0.00e+00 | BMI | EUR |
PRS.resid | PRS.resid | 0.271 | 0.271 | 0.000 | 1.00e+00 | BMI | EUR |
PRS.resid | All | 0.271 | 0.271 | 0.000 | 4.69e-01 | BMI | EUR |
All | PCs | 0.271 | 0.005 | 0.266 | 0.00e+00 | BMI | EUR |
All | PRS.resid | 0.271 | 0.271 | 0.000 | 4.69e-01 | BMI | EUR |
All | All | 0.271 | 0.271 | 0.000 | 1.00e+00 | BMI | EUR |
PCs | PCs | 0.008 | 0.008 | 0.000 | 1.00e+00 | Height | EUR |
PCs | PRS.resid | 0.008 | 0.323 | -0.316 | 0.00e+00 | Height | EUR |
PCs | All | 0.008 | 0.323 | -0.316 | 0.00e+00 | Height | EUR |
PRS.resid | PCs | 0.323 | 0.008 | 0.316 | 0.00e+00 | Height | EUR |
PRS.resid | PRS.resid | 0.323 | 0.323 | 0.000 | 1.00e+00 | Height | EUR |
PRS.resid | All | 0.323 | 0.323 | 0.000 | 5.02e-01 | Height | EUR |
All | PCs | 0.323 | 0.008 | 0.316 | 0.00e+00 | Height | EUR |
All | PRS.resid | 0.323 | 0.323 | 0.000 | 5.02e-01 | Height | EUR |
All | All | 0.323 | 0.323 | 0.000 | 1.00e+00 | Height | EUR |
PCs | PCs | 0.092 | 0.092 | 0.000 | 1.00e+00 | BMI | SAS |
PCs | PRS.resid | 0.092 | 0.227 | -0.135 | 2.07e-17 | BMI | SAS |
PCs | All | 0.092 | 0.245 | -0.153 | 4.52e-34 | BMI | SAS |
PRS.resid | PCs | 0.227 | 0.092 | 0.135 | 2.07e-17 | BMI | SAS |
PRS.resid | PRS.resid | 0.227 | 0.227 | 0.000 | 1.00e+00 | BMI | SAS |
PRS.resid | All | 0.227 | 0.245 | -0.018 | 1.84e-05 | BMI | SAS |
All | PCs | 0.245 | 0.092 | 0.153 | 4.52e-34 | BMI | SAS |
All | PRS.resid | 0.245 | 0.227 | 0.018 | 1.84e-05 | BMI | SAS |
All | All | 0.245 | 0.245 | 0.000 | 1.00e+00 | BMI | SAS |
PCs | PCs | 0.096 | 0.096 | 0.000 | 1.00e+00 | Height | SAS |
PCs | PRS.resid | 0.096 | 0.212 | -0.116 | 2.08e-13 | Height | SAS |
PCs | All | 0.096 | 0.234 | -0.138 | 1.13e-31 | Height | SAS |
PRS.resid | PCs | 0.212 | 0.096 | 0.116 | 2.08e-13 | Height | SAS |
PRS.resid | PRS.resid | 0.212 | 0.212 | 0.000 | 1.00e+00 | Height | SAS |
PRS.resid | All | 0.212 | 0.234 | -0.022 | 1.75e-05 | Height | SAS |
All | PCs | 0.234 | 0.096 | 0.138 | 1.13e-31 | Height | SAS |
All | PRS.resid | 0.234 | 0.212 | 0.022 | 1.75e-05 | Height | SAS |
All | All | 0.234 | 0.234 | 0.000 | 1.00e+00 | Height | SAS |
I have performed GWAS for 17 traits in a training subset of EUR individuals in UKB. Perform polygenic scoring for each of these GWAS, and compares portability.
Show code
mkdir /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/TWAS
> /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/TWAS/todo.txt
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/TWAS/${pheno}_withCOLOC/${pheno}_res_GW.txt ]; then
echo $pheno /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/GWAS/${pheno}/UKBB.w_hm3.QCd.AllSNP.${pheno}.GW.qassoc.clean.QC.sumstats.gz >> /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/TWAS/todo.txt
fi
done
sbatch -p brc,shared --mem 10G -n 4 --array 1-$(wc -l /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/TWAS/todo.txt | cut -d' ' -f1)%3 /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Run_TWAS/Run_TWAS_withCOLOC.sh \
--gwas_list /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/TWAS/todo.txt \
--pos /users/k1806347/brc_scratch/Data/TWAS_sumstats/FUSION/All_tissues.pos \
--outdir /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/TWAS \
--fusion_dir ${FUSION_dir} \
--ncores 4
GeRS
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Pipeline_prep.config
mkdir -p /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS
> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/todo.txt
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
for weights in $(cat ${TWAS_rep}/snp_weight_list.txt);do
if [ ! -f /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/${pheno}/1KGPhase3.w_hm3.EUR.FUSION.${pheno}.${weights}.scale ]; then
echo $pheno $weights >> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/todo.txt
fi
done
done
cat > /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p brc,shared
#SBATCH -J GeRS
#SBATCH --mem 5G
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Pipeline_prep.config
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/todo.txt)
weights=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $2}' /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/todo.txt)
echo ${pheno}
echo ${weights}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/ref_funtionally_informed_risk_scorer/ref_funtionally_informed_risk_scorer.R \
--twas_results /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/TWAS/${pheno}_withCOLOC/${pheno}_res_GW.txt \
--ref_feature_pred ${Geno_1KG_dir}/Predicted_expression/FUSION/${weights}/1KGPhase3.w_hm3.FUSION.${weights}.predictions.gz \
--output /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/${pheno}/1KGPhase3.w_hm3.EUR.FUSION.${pheno}.${weights} \
--ref_keep ${Geno_1KG_dir}/keep_files/EUR_samples.keep \
--ref_scale ${Geno_1KG_dir}/Predicted_expression/FUSION/${weights}/1KGPhase3.w_hm3.FUSION.${weights}.EUR.scale \
--panel ${weights}
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/todo.txt | cut -d' ' -f1)%40 /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/score_sbatch.sh
pT + clump: Sparse
pop1=EUR
> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/pt_clump/score_todo.txt
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/pt_clump/${pheno}/1KGPhase3.w_hm3.${pop1}.pt_clump.${pheno}.EUR.scale ]; then
${pheno} >> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/pt_clump/score_todo.txt
fi
done
cat > /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/pt_clump/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p shared,brc
#SBATCH -J pt_clump
pop1=EUR
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/pt_clump/score_todo.txt)
echo ${pop1}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/polygenic_score_file_creator/polygenic_score_file_creator.R \
--ref_plink_chr /users/k1806347/brc_scratch/Data/1KG/Phase3/1KGPhase3.w_hm3.chr \
--ref_keep /users/k1806347/brc_scratch/Data/1KG/Phase3/keep_files/${pop1}_samples.keep \
--sumstats /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/GWAS/${pheno}/UKBB.w_hm3.QCd.AllSNP.${pheno}.GW.qassoc.clean.QC.gz \
--plink /users/k1806347/brc_scratch/Software/plink1.9.sh \
--output /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/pt_clump/${pheno}/1KGPhase3.w_hm3.${pop1}.pt_clump.${pheno} \
--ref_pop_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/super_pop_keep.list
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/pt_clump/score_todo.txt | cut -d' ' -f1)%20 /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/pt_clump/score_sbatch.sh
lassosum
pop1=EUR
mkdir -p /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum
> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/score_todo.txt
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/${pheno}/1KGPhase3.w_hm3.${pop1}.lassosum.${pheno}.EUR.scale ]; then
echo ${pheno} >> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/score_todo.txt
fi
done
cat > /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p shared,brc
#SBATCH --mem 10G
#SBATCH -J lassosum
pop1=EUR
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/score_todo.txt)
echo ${pop1}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/polygenic_score_file_creator_lassosum/polygenic_score_file_creator_lassosum.R \
--ref_plink_gw /users/k1806347/brc_scratch/Data/1KG/Phase3/1KGPhase3.w_hm3.GW \
--ref_keep /users/k1806347/brc_scratch/Data/1KG/Phase3/keep_files/${pop1}_samples.keep \
--sumstats /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/GWAS/${pheno}/UKBB.w_hm3.QCd.AllSNP.${pheno}.GW.qassoc.clean.QC.gz \
--plink /users/k1806347/brc_scratch/Software/plink1.9.sh \
--output /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/${pheno}/1KGPhase3.w_hm3.${pop1}.lassosum.${pheno} \
--ref_pop_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/super_pop_keep.list
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/score_todo.txt | cut -d' ' -f1)%10 /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/score_sbatch.sh
PRScs
pop1=EUR
mkdir -p /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs
> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs/score_todo.txt
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs/${pheno}/1KGPhase3.w_hm3.${pop1}.PRScs.${pheno}.EUR.scale ]; then
echo ${pheno} >> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs/score_todo.txt
fi
done
cat > /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p shared,brc
#SBATCH --mem 5G
#SBATCH -n 1
#SBATCH --nodes=1
#SBATCH -t 48:00:00
#SBATCH -J PRScs_batch
pop1=EUR
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs/score_todo.txt)
echo ${pop1}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/polygenic_score_file_creator_PRScs/polygenic_score_file_creator_PRScs_2.R \
--ref_plink_chr /users/k1806347/brc_scratch/Data/1KG/Phase3/1KGPhase3.w_hm3.chr \
--ref_keep /users/k1806347/brc_scratch/Data/1KG/Phase3/keep_files/${pop1}_samples.keep \
--sumstats /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/GWAS/${pheno}/UKBB.w_hm3.QCd.AllSNP.${pheno}.GW.qassoc.clean.QC.gz \
--plink /users/k1806347/brc_scratch/Software/plink1.9.sh \
--output /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs/${pheno}/1KGPhase3.w_hm3.${pop1}.PRScs.${pheno} \
--ref_pop_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/super_pop_keep.list \
--PRScs_path /users/k1806347/brc_scratch/Software/PRScs.sh \
--PRScs_ref_path /users/k1806347/brc_scratch/Software/PRScs/ldblk_1kg_eur \
--n_cores 15 \
--phi_param 1e-6,1e-4,1e-2,1,auto
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs/score_todo.txt | cut -d' ' -f1)%2 /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs/score_sbatch.sh
SBayesR
pop1=EUR
mkdir -p /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR
> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR/score_todo.txt
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR/${pheno}/1KGPhase3.w_hm3.${pop1}.SBayesR.${pheno}.EUR.scale ]; then
echo ${pheno} >> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR/score_todo.txt
fi
done
cat > /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p shared,brc
#SBATCH --mem 50G
#SBATCH -n 6
#SBATCH -J SBayesR
pop1=EUR
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR/score_todo.txt)
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Pipeline_prep.config
echo ${pop1}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/polygenic_score_file_creator_SBayesR/polygenic_score_file_creator_SBayesR.R \
--ref_plink /users/k1806347/brc_scratch/Data/1KG/Phase3/1KGPhase3.w_hm3.GW \
--ref_keep /users/k1806347/brc_scratch/Data/1KG/Phase3/keep_files/${pop1}_samples.keep \
--sumstats /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/GWAS/${pheno}/UKBB.w_hm3.QCd.AllSNP.${pheno}.GW.qassoc.clean.QC.gz \
--plink /users/k1806347/brc_scratch/Software/plink1.9.sh \
--gctb ${gctb_203} \
--ld_matrix_chr /users/k1806347/brc_scratch/Data/1KG/Phase3/LD_matrix/GCTB_shared/ukbEURu_hm3_shrunk_sparse/ukbEURu_hm3_v3_50k_chr \
--output /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR/${pheno}/1KGPhase3.w_hm3.${pop1}.SBayesR.${pheno} \
--ref_pop_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/super_pop_keep.list \
--n_cores 6 \
--memory 50000
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR/score_todo.txt | cut -d' ' -f1)%4 /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR/score_sbatch.sh
LDPred2
pop1=EUR
mkdir -p /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2
> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2/score_todo.txt
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2/${pheno}/1KGPhase3.w_hm3.${pop1}.LDPred2.${pheno}.EUR.scale ]; then
echo ${pheno} >> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2/score_todo.txt
fi
done
cat > /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p shared,brc
#SBATCH --mem 70G
#SBATCH -J LDPred2
#SBATCH -n 10
#SBATCH --nodes 1
#SBATCH -t 5-00:00:00
pop1=EUR
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2/score_todo.txt)
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Pipeline_prep.config
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/polygenic_score_file_creator_LDPred2/polygenic_score_file_creator_LDPred2.R \
--ref_plink ${Geno_1KG_dir}/1KGPhase3.w_hm3.GW \
--ref_keep /users/k1806347/brc_scratch/Data/1KG/Phase3/keep_files/${pop1}_samples.keep \
--ldpred2_ref_dir ${Geno_1KG_dir}/LD_matrix/LDPred2 \
--sumstats /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/GWAS/${pheno}/UKBB.w_hm3.QCd.AllSNP.${pheno}.GW.qassoc.clean.QC.gz \
--plink ${plink1_9} \
--memory 20000 \
--n_cores 10 \
--output /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2/${pheno}/1KGPhase3.w_hm3.${pop1}.LDPred2.${pheno} \
--ref_pop_scale ${Geno_1KG_dir}/super_pop_keep.list
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2/score_todo.txt | cut -d' ' -f1)%3 /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2/score_sbatch.sh
DBSLMM
pop1=EUR
mkdir -p /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM
> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM/score_todo.txt
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM/${pheno}/1KGPhase3.w_hm3.${pop1}.DBSLMM.${pheno}.EUR.scale ]; then
echo ${pheno} >> /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM/score_todo.txt
fi
done
cat > /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p shared,brc
#SBATCH --mem 10G
#SBATCH -n 2
#SBATCH --nodes 1
#SBATCH -J DBSLMM
pop1=EUR
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM/score_todo.txt)
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Pipeline_prep.config
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/polygenic_score_file_creator_DBSLMM/polygenic_score_file_creator_DBSLMM.R \
--ref_plink_chr ${Geno_1KG_dir}/1KGPhase3.w_hm3.chr \
--ref_keep /users/k1806347/brc_scratch/Data/1KG/Phase3/keep_files/${pop1}_samples.keep \
--sumstats /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/GWAS/${pheno}/UKBB.w_hm3.QCd.AllSNP.${pheno}.GW.qassoc.clean.QC.gz \
--plink ${plink1_9} \
--memory 5000 \
--ld_blocks /users/k1806347/brc_scratch/Data/LDetect/EUR \
--rscript /users/k1806347/brc_scratch/Software/Rscript.sh \
--dbslmm /users/k1806347/brc_scratch/Software/DBSLMM/software \
--munge_sumstats ${munge_sumstats} \
--ldsc ${ldsc} \
--ldsc_ref ${ldsc_ref} \
--hm3_snplist ${HapMap3_snplist_dir}/w_hm3.snplist \
--sample_prev NA \
--pop_prev NA \
--output /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM/${pheno}/1KGPhase3.w_hm3.${pop1}.DBSLMM.${pheno} \
--ref_pop_scale ${Geno_1KG_dir}/super_pop_keep.list
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM/score_todo.txt | cut -d' ' -f1)%3 /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM/score_sbatch.sh
GeRS
mkdir -p /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/
pop1=EUR
> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/score_todo.txt
for pop2 in $(echo EUR_test EAS AMR AFR SAS); do
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
for weights in $(cat ~/brc_scratch/Data/TWAS_sumstats/FUSION/snp_weight_list.txt);do
if [ ! -f /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/${pheno}/UKBB.w_hm3.${pop2}.${pheno}.${weights}.fiprofile ]; then
echo ${pop2} ${pheno} ${weights} >> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/score_todo.txt
fi
done
done
done
cat > /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p brc,shared
#SBATCH -J GeRS
#SBATCH --mem 15G
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
pop1=EUR
pop2=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/score_todo.txt)
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $2}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/score_todo.txt)
weights=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $3}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/score_todo.txt)
pop2_short=${pop2}
if echo "$pop2" | grep -q "EUR_test"; then
pop2_short=EUR
fi
echo ${pop1}
echo ${pop2}
echo ${pop2_short}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/scaled_functionally_informed_risk_scorer/scaled_functionally_informed_risk_scorer.R \
--targ_feature_pred ${UKBB_output}/Predicted_expression/FUSION/${pop2}/${weights}/UKBB.w_hm3.QCd.AllSNP.FUSION.${weights}.predictions.gz \
--target_keep /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop2}/${pheno}_resid.pheno \
--ref_score /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/${pheno}/1KGPhase3.w_hm3.EUR.FUSION.${pheno}.${weights}.score \
--ref_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/GeRS/${pheno}/1KGPhase3.w_hm3.EUR.FUSION.${pheno}.${weights}.scale \
--pheno_name ${pheno} \
--n_cores 1 \
--pigz ${pigz_binary} \
--output /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/${pheno}/UKBB.w_hm3.${pop2}.${pheno}.${weights}
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/score_todo.txt | cut -d' ' -f1)%20 /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/score_sbatch.sh
# Note, standardise all predictors when running model_builder as GeRS are scaled to EUR population. But gene expression is standardised.
pT + clump: Sparse
pop1=EUR
> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/pt_clump/score_todo.txt
for pop2 in $(echo EUR_test EAS AMR AFR SAS); do
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/pt_clump/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}.profiles ]; then
echo ${pop2} ${pheno} >> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/pt_clump/score_todo.txt
fi
done
done
cat > /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/pt_clump/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p brc,shared
#SBATCH -J pt_clump
pop1=EUR
pop2=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/pt_clump/score_todo.txt)
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $2}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/pt_clump/score_todo.txt)
pop2_short=${pop2}
if echo "$pop2" | grep -q "EUR_test"; then
pop2_short=EUR
fi
echo ${pop1}
echo ${pop2}
echo ${pop2_short}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer/Scaled_polygenic_scorer.R \
--target_plink_chr /users/k1806347/brc_scratch/Data/UKBB/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop2}/${pheno}_resid.pheno \
--ref_score /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/pt_clump/${pheno}/1KGPhase3.w_hm3.${pop1}.pt_clump.${pheno} \
--ref_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/pt_clump/${pheno}/1KGPhase3.w_hm3.${pop1}.pt_clump.${pheno}.${pop2_short}.scale \
--ref_freq_chr /users/k1806347/brc_scratch/Data/1KG/Phase3/freq_files/${pop2_short}/1KGPhase3.w_hm3.${pop2_short}.chr \
--plink /users/k1806347/brc_scratch/Software/plink1.9.sh \
--output /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/pt_clump/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/pt_clump/score_todo.txt | cut -d' ' -f1)%20 /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/pt_clump/score_sbatch.sh
lassosum
pop1=EUR
> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/lassosum/score_todo.txt
for pop2 in $(echo EUR_test EAS AMR AFR SAS); do
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/lassosum/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}.lassosum_profiles ]; then
echo ${pop2} ${pheno} >> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/lassosum/score_todo.txt
fi
done
done
cat > /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/lassosum/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p brc,shared
#SBATCH -J lassosum
#SBATCH -n 5
#SBATCH --mem 5G
pop1=EUR
pop2=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/lassosum/score_todo.txt)
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $2}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/lassosum/score_todo.txt)
pop2_short=${pop2}
if echo "$pop2" | grep -q "EUR_test"; then
pop2_short=EUR
fi
echo ${pop1}
echo ${pop2}
echo ${pop2_short}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_lassosum/Scaled_polygenic_scorer_lassosum.R \
--target_plink_chr /users/k1806347/brc_scratch/Data/UKBB/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop2}/${pheno}_resid.pheno \
--ref_score /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/${pheno}/1KGPhase3.w_hm3.${pop1}.lassosum.${pheno} \
--ref_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/${pheno}/1KGPhase3.w_hm3.${pop1}.lassosum.${pheno}.${pop2_short}.scale \
--ref_freq_chr /users/k1806347/brc_scratch/Data/1KG/Phase3/freq_files/${pop2_short}/1KGPhase3.w_hm3.${pop2_short}.chr \
--n_cores 5 \
--pheno_name ${pheno} \
--plink /users/k1806347/brc_scratch/Software/plink1.9.sh \
--output /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/lassosum/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/lassosum/score_todo.txt | cut -d' ' -f1)%6 /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/lassosum/score_sbatch.sh
PRScs
pop1=EUR
> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/PRScs/score_todo.txt
for pop2 in $(echo EUR_test EAS AMR AFR SAS); do
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/PRScs/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}.PRScs_profiles ]; then
echo ${pop2} ${pheno} >> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/PRScs/score_todo.txt
fi
done
done
cat > /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/PRScs/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p brc,shared
#SBATCH -J PRScs
#SBATCH -n 1
#SBATCH --mem 5G
pop1=EUR
pop2=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/PRScs/score_todo.txt)
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $2}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/PRScs/score_todo.txt)
pop2_short=${pop2}
if echo "$pop2" | grep -q "EUR_test"; then
pop2_short=EUR
fi
echo ${pop1}
echo ${pop2}
echo ${pop2_short}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_PRScs/Scaled_polygenic_scorer_PRScs.R \
--target_plink_chr /users/k1806347/brc_scratch/Data/UKBB/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop2}/${pheno}_resid.pheno \
--ref_score /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs/${pheno}/1KGPhase3.w_hm3.${pop1}.PRScs.${pheno} \
--ref_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/PRScs/${pheno}/1KGPhase3.w_hm3.${pop1}.PRScs.${pheno}.${pop2_short}.scale \
--ref_freq_chr /users/k1806347/brc_scratch/Data/1KG/Phase3/freq_files/${pop2_short}/1KGPhase3.w_hm3.${pop2_short}.chr \
--pheno_name ${pheno} \
--plink /users/k1806347/brc_scratch/Software/plink1.9.sh \
--output /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/PRScs/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/PRScs/score_todo.txt | cut -d' ' -f1)%6 /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/PRScs/score_sbatch.sh
SBayesR
pop1=EUR
mkdir /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/SBayesR
> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/SBayesR/score_todo.txt
for pop2 in $(echo EUR_test EAS AMR AFR SAS); do
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/SBayesR/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}.SBayesR_profiles ]; then
echo ${pop2} ${pheno} >> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/SBayesR/score_todo.txt
fi
done
done
cat > /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/SBayesR/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p brc,shared
#SBATCH -J SBayesR
#SBATCH --mem 10G
pop1=EUR
pop2=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/SBayesR/score_todo.txt)
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $2}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/SBayesR/score_todo.txt)
pop2_short=${pop2}
if echo "$pop2" | grep -q "EUR_test"; then
pop2_short=EUR
fi
echo ${pop1}
echo ${pop2}
echo ${pop2_short}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_SBayesR/Scaled_polygenic_scorer_SBayesR.R \
--target_plink_chr /users/k1806347/brc_scratch/Data/UKBB/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop2}/${pheno}_resid.pheno \
--ref_score /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR/${pheno}/GWAS_sumstats_SBayesR.GW.snpRes \
--ref_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/SBayesR/${pheno}/1KGPhase3.w_hm3.${pop1}.SBayesR.${pheno}.${pop2_short}.scale \
--ref_freq_chr /users/k1806347/brc_scratch/Data/1KG/Phase3/freq_files/${pop2_short}/1KGPhase3.w_hm3.${pop2_short}.chr \
--pheno_name ${pheno} \
--plink /users/k1806347/brc_scratch/Software/plink1.9.sh \
--output /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/SBayesR/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/SBayesR/score_todo.txt | cut -d' ' -f1)%6 /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/SBayesR/score_sbatch.sh
LDPred2
pop1=EUR
> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/LDPred2/score_todo.txt
for pop2 in $(echo EUR_test EAS AMR AFR SAS); do
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/LDPred2/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}.LDPred_profiles ]; then
echo ${pop2} ${pheno} >> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/LDPred2/score_todo.txt
fi
done
done
cat > /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/LDPred2/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p brc,shared
#SBATCH -J LDPred2
#SBATCH -n 6
#SBATCH --mem 5G
pop1=EUR
pop2=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/LDPred2/score_todo.txt)
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $2}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/LDPred2/score_todo.txt)
pop2_short=${pop2}
if echo "$pop2" | grep -q "EUR_test"; then
pop2_short=EUR
fi
echo ${pop1}
echo ${pop2}
echo ${pop2_short}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_LDPred2/Scaled_polygenic_scorer_LDPred2.R \
--target_plink_chr /users/k1806347/brc_scratch/Data/UKBB/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop2}/${pheno}_resid.pheno \
--ref_score /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2/${pheno}/1KGPhase3.w_hm3.${pop1}.LDPred2.${pheno} \
--ref_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/LDPred2/${pheno}/1KGPhase3.w_hm3.${pop1}.LDPred2.${pheno}.${pop2_short}.scale \
--ref_freq_chr /users/k1806347/brc_scratch/Data/1KG/Phase3/freq_files/${pop2_short}/1KGPhase3.w_hm3.${pop2_short}.chr \
--n_cores 6 \
--pheno_name ${pheno} \
--plink /users/k1806347/brc_scratch/Software/plink1.9.sh \
--output /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/LDPred2/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/LDPred2/score_todo.txt | cut -d' ' -f1)%6 /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/LDPred2/score_sbatch.sh
DBSLMM
pop1=EUR
mkdir /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/DBSLMM
> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/DBSLMM/score_todo.txt
for pop2 in $(echo EUR_test EAS AMR AFR SAS); do
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt); do
if [ ! -f /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/DBSLMM/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}.DBSLMM_profiles ]; then
echo ${pop2} ${pheno} >> /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/DBSLMM/score_todo.txt
fi
done
done
cat > /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/DBSLMM/score_sbatch.sh << 'EOF'
#!/bin/sh
#SBATCH -p brc,shared
#SBATCH -J DBSLMM
#SBATCH --mem 10G
pop1=EUR
pop2=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $1}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/DBSLMM/score_todo.txt)
pheno=$(awk -v var="$SLURM_ARRAY_TASK_ID" 'NR == var {print $2}' /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/DBSLMM/score_todo.txt)
pop2_short=${pop2}
if echo "$pop2" | grep -q "EUR_test"; then
pop2_short=EUR
fi
echo ${pop1}
echo ${pop2}
echo ${pop2_short}
echo ${pheno}
/users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Scaled_polygenic_scorer_DBSLMM/Scaled_polygenic_scorer_DBSLMM.R \
--target_plink_chr /users/k1806347/brc_scratch/Data/UKBB/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr \
--target_keep /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop2}/${pheno}_resid.pheno \
--ref_score /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM/${pheno}/1KGPhase3.w_hm3.${pop1}.DBSLMM.${pheno}.dbslmm.GW.txt \
--ref_scale /users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/DBSLMM/${pheno}/1KGPhase3.w_hm3.${pop1}.DBSLMM.${pheno}.${pop2_short}.scale \
--ref_freq_chr /users/k1806347/brc_scratch/Data/1KG/Phase3/freq_files/${pop2_short}/1KGPhase3.w_hm3.${pop2_short}.chr \
--pheno_name ${pheno} \
--plink /users/k1806347/brc_scratch/Software/plink1.9.sh \
--output /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/DBSLMM/${pop2}/${pheno}/UKBB.${pop2}.w_hm3.${pheno}
EOF
sbatch --array 1-$(wc -l /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/DBSLMM/score_todo.txt | cut -d' ' -f1)%6 /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/DBSLMM/score_sbatch.sh
pT + clump comparison
##############################
# Evaluating predictive utility of pT + clump PRSs across multiple pTs individually and in combination
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt);do
mkdir -p /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRSs
for pop in $(echo AFR AMR EAS EUR_test SAS);do
cat > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRSs/UKBB.${pop}.w_hm3.${pheno}.EUR-PRSs.predictor_groups <<EOF
predictors
${UKBB_output}/DiverseAncestry/1KG_ref/pt_clump/${pop}/${pheno}/UKBB.${pop}.w_hm3.${pheno}.profiles
EOF
done
done
# pT + clump (sparse)
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt);do
for pop in $(echo AFR AMR EAS EUR_test SAS);do
sbatch --mem 10G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop}/${pheno}_resid.pheno \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRSs/UKBB.${pop}.w_hm3.${pheno}.EUR-PRSs \
--n_core 1 \
--compare_predictors T \
--assoc T \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRSs/UKBB.${pop}.w_hm3.${pheno}.EUR-PRSs.predictor_groups
done
done
Comparison of PRS methods
##############################
# Evaluating predictive utility of pT + clump PRSs across multiple pTs individually and in combination
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt);do
for pop in $(echo AFR AMR EAS EUR_test SAS);do
cat > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRSs/UKBB.${pop}.w_hm3.${pheno}.EUR-PRSs.AllMethodComp.predictor_groups <<EOF
predictors group
${UKBB_output}/DiverseAncestry/1KG_ref/pt_clump/${pop}/${pheno}/UKBB.${pop}.w_hm3.${pheno}.profiles pt_clump
${UKBB_output}/DiverseAncestry/1KG_ref/lassosum/${pop}/${pheno}/UKBB.${pop}.w_hm3.${pheno}.lassosum_profiles lassosum
${UKBB_output}/DiverseAncestry/1KG_ref/PRScs/${pop}/${pheno}/UKBB.${pop}.w_hm3.${pheno}.PRScs_profiles PRScs
${UKBB_output}/DiverseAncestry/1KG_ref/SBayesR/${pop}/${pheno}/UKBB.${pop}.w_hm3.${pheno}.SBayesR_profiles SBayesR
${UKBB_output}/DiverseAncestry/1KG_ref/LDPred2/${pop}/${pheno}/UKBB.${pop}.w_hm3.${pheno}.LDPred_profiles LDPred2
${UKBB_output}/DiverseAncestry/1KG_ref/DBSLMM/${pop}/${pheno}/UKBB.${pop}.w_hm3.${pheno}.DBSLMM_profiles DBSLMM
EOF
done
done
# Derive and evaluate models
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt);do
for pop in $(echo AFR AMR EAS EUR_test SAS);do
sbatch --mem 20G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop}/${pheno}_resid.pheno \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRSs/UKBB.${pop}.w_hm3.${pheno}.EUR-PRSs.AllMethodComp \
--n_core 1 \
--compare_predictors T \
--assoc T \
--outcome_pop_prev NA \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRSs/UKBB.${pop}.w_hm3.${pheno}.EUR-PRSs.AllMethodComp.predictor_groups
done
done
GeRS + PRS
##############################
# Evaluating predictive utility of GeRS and PRS individually and in combination
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Make required directories
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt);do
mkdir -p /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_GeRSs
done
# Create a file listing the predictors files
weights=$(cat ${TWAS_rep}/snp_weight_list.txt)
for pheno in $(head -n 5 /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt);do
for pop in $(echo AFR AMR EAS EUR_test SAS);do
echo "predictors group" > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${pheno}.${pop}-GeRSs.${pop}-PRSs.pt_clump.predictor_groups
for weight in ${weights}; do
if [ -f /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/${pheno}/UKBB.w_hm3.${pop}.${pheno}.${weight}.fiprofile ]; then
echo /users/k1806347/brc_scratch/Data/UKBB/DiverseAncestry/1KG_ref/GeRS/${pheno}/UKBB.w_hm3.${pop}.${pheno}.${weight}.fiprofile GeRS >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${pheno}.${pop}-GeRSs.${pop}-PRSs.pt_clump.predictor_groups
fi
done
echo ${UKBB_output}/DiverseAncestry/1KG_ref/pt_clump/${pop}/${pheno}/UKBB.${pop}.w_hm3.${pheno}.profiles PRS >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${pheno}.${pop}-GeRSs.${pop}-PRSs.pt_clump.predictor_groups
done
done
# Derive and evaluate models
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt);do
for pop in $(echo AFR AMR EAS EUR_test SAS);do
sbatch --mem 10G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop}/${pheno}_resid.pheno \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${pheno}.${pop}-GeRSs.${pop}-PRSs.pt_clump \
--n_core 1 \
--compare_predictors F \
--assoc T \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.${pheno}.${pop}-GeRSs.${pop}-PRSs.pt_clump.predictor_groups
done
done
PCs with PRS residuals
Rather than modifying the model builder script. Calculate PRS and PC interactions in advance.
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config')
dir.create(paste0(UKBB_output,'/DiverseAncestry/1KG_ref/PRS_residuals'))
pheno_list<-read.table('/users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt')$V1
library(data.table)
for(pheno in pheno_list){
for(pop in c('EUR','AFR','SAS','EAS','AMR')){
if(pop == 'EUR'){
pop2<-paste0(pop,'_test')
} else {
pop2<-pop
}
# Read in prs and PC data
pcs<-fread(paste0(UKBB_output,'/DiverseAncestry/1KG_ref/PCs/UKBB.Ancestry.',pop,'.UpdateIDs.eigenvec'))
prs<-fread(paste0(UKBB_output,'/DiverseAncestry/1KG_ref/pt_clump/',pop2,'/',pheno,'/UKBB.',pop2,'.w_hm3.',pheno,'.profiles'))
names(prs)[-1:-2]<-paste0('PRS',1:(dim(prs)[2]-2))
both<-merge(prs, pcs, by=c('FID','IID'))
both_resid<-both[,grepl('FID|IID|PRS', names(both)), with=F]
for(i in names(prs[,-1:-2])){
both_resid[[i]]<-as.numeric(scale(resid(lm(as.formula(paste0(i,'~',paste0(names(pcs[,-1:-2]), collapse=' + '))),data=both))))
}
fwrite(both_resid, paste0(UKBB_output,'/DiverseAncestry/1KG_ref/PRS_residuals/UKBB.',pheno,'.',pop2,'.prs_residuals.txt'), quote=F, na='NA', sep=' ')
}
}
##############################
# Evaluating predictive utility of PCs in combination with PRS
##############################
. /users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config
# Make required directories
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt);do
mkdir -p /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_PCs
done
# Create a file listing the predictors files
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt);do
for pop in $(echo AFR AMR EAS EUR SAS);do
if [ ${pop} == 'EUR' ]
then
pop2=EUR_test
else
pop2=${pop}
fi
echo "predictors group" > /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_PCs/UKBB.w_hm3.${pheno}.${pop2}-PCs-PRSs.pt_clump-residuals.predictor_groups
echo ${UKBB_output}/DiverseAncestry/1KG_ref/PCs/UKBB.Ancestry.${pop}.UpdateIDs.eigenvec PCs >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_PCs/UKBB.w_hm3.${pheno}.${pop2}-PCs-PRSs.pt_clump-residuals.predictor_groups
echo ${UKBB_output}/DiverseAncestry/1KG_ref/PRS_residuals/UKBB.${pheno}.${pop2}.prs_residuals.txt PRS_resid >> /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_PCs/UKBB.w_hm3.${pheno}.${pop2}-PCs-PRSs.pt_clump-residuals.predictor_groups
done
done
# Derive and evaluate models
for pheno in $(cat /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt);do
for pop in $(echo AFR AMR EAS EUR_test SAS);do
sbatch --mem 10G -n 1 -p brc,shared /users/k1806347/brc_scratch/Software/Rscript.sh /users/k1806347/brc_scratch/Software/MyGit/GenoPred/Scripts/Model_builder/Model_builder_V2_nested.R \
--pheno /users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/${pop}/${pheno}_resid.pheno \
--out /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_PCs/UKBB.w_hm3.${pheno}.${pop}-PCs-PRSs.pt_clump-residuals \
--n_core 1 \
--compare_predictors F \
--assoc T \
--outcome_pop_prev NA \
--predictors /users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/${pheno}/Association_withPRS_and_PCs/UKBB.w_hm3.${pheno}.${pop}-PCs-PRSs.pt_clump-residuals.predictor_groups
done
done
Residualising PRS in advance makes very little difference to the all model, but residualised PRS generally explain more variance in non-EUR populations. I think as suggested by the Alkes price paper, residualising in advance is a good idea, and also disentangles the population stratification in PRS from PCs.
pT + clump comparison
library(data.table)
pop<-c('AFR','AMR','EAS','EUR_test','SAS')
pheno<-fread('/users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt', header=F)$V1
res<-list()
for(i in 1:length(pheno)){
res_pheno<-NULL
for(k in 1:length(pop)){
tmp<-fread(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRSs/UKBB.',pop[k],'.w_hm3.',pheno[i],'.EUR-PRSs.pred_eval.txt'))
tmp2<-data.frame( Population=gsub('_test','',pop[k]),
Phenotype=pheno[i],
tmp)
res_pheno<-rbind(res_pheno, tmp2)
}
res_pheno$Model<-gsub('e.','e-',gsub('_.*','',gsub('SCORE.','',res_pheno$Model)))
res_pheno$Model<-factor(res_pheno$Model, levels=unique(res_pheno$Model))
res[[pheno[i]]]<-res_pheno
}
library(ggplot2)
library(cowplot)
plot_list<-NULL
for(i in 1:length(pheno)){
plot_list[[pheno[i]]]<-ggplot(res[[pheno[i]]], aes(x=factor(Model), y=R, colour=Population)) +
geom_point(stat="identity", position=position_dodge( width=.25)) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(.25)) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 55, vjust = 1, hjust=1), plot.title = element_text(hjust = 0.4, size=12)) +
background_grid(major = 'y', minor = 'y') +
labs(x='pT',y="Correlation (SE)", title=pheno[i])
}
png('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/UKB.Diverse.internal.pTclump_per_pT.png', units='px', res=300, width=3000, height=10000)
plot_grid(plotlist=plot_list, ncol = 2)
dev.off()
Show pT + clump results
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config')
pheno<-fread('/users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt', header=F)$V1
pheno<-pheno[1:5]
pop<-c('AFR','AMR','EAS','EUR_test','SAS')
##################
# Organise, make tables and figures for per phenotype results
##################
res_eval<-NULL
res_comp<-NULL
res_eval_plots<-list()
res_eval_diff_plots<-list()
res_eval_diff_matrix<-list()
j<-1
for(i in 1:length(pheno)){
for(k in 1:length(pop)){
res_eval_i<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRSs/UKBB.',pop[k],'.w_hm3.',pheno[i],'.EUR-PRSs.AllMethodComp.pred_eval.txt'), header=T, stringsAsFactors=F)
res_comp_i<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRSs/UKBB.',pop[k],'.w_hm3.',pheno[i],'.EUR-PRSs.AllMethodComp.pred_comp.txt'), header=T, stringsAsFactors=F)
# Idenitfy model names for 10FCVal, multi-PRS and pseudoval
selected_models<-NULL
##
# pT+clump (sparse)
##
# Identify best pT in 10FCVal
res_eval_i_per_pT<-res_eval_i[grepl('pt.clump.PredFile', res_eval_i$Model),]
selected_models<-rbind(selected_models,data.frame(Test='pT+clump.10FCVal', Label=
res_eval_i_per_pT[res_eval_i_per_pT$R == max(res_eval_i_per_pT$R),]$Model[1]))
# Identify multi pT
selected_models<-rbind(selected_models,data.frame(Test='pT+clump.MultiPRS', Label='pt.clump'))
##
# lassosum
##
# Identify best pT in 10FCVal
res_eval_i_per_lassosum<-res_eval_i[grepl('lassosum.PredFile', res_eval_i$Model),]
selected_models<-rbind(selected_models,data.frame(Test='lassosum.10FCVal', Label=
res_eval_i_per_lassosum[res_eval_i_per_lassosum$R == max(res_eval_i_per_lassosum$R),]$Model[1]))
# Identify multi pT
selected_models<-rbind(selected_models,data.frame(Test='lassosum.MultiPRS', Label='lassosum'))
# Identify pseudoVal
lassosum_val<-read.table(paste0('/users/k1806347/brc_scratch/Data/1KG/Phase3/DiverseAncestry/score_files/lassosum/',pheno[i],'/1KGPhase3.w_hm3.EUR.lassosum.',pheno[i],'.log'), sep='&')
lassosum_val_s<-as.numeric(gsub('s = ','',lassosum_val$V1[grepl('s = ',lassosum_val$V1)]))
lassosum_val_lambda<-as.numeric(gsub('lambda =','',lassosum_val$V1[grepl('lambda =',lassosum_val$V1)]))
lassosum_val_param<-paste0('s',lassosum_val_s,'.lambda',lassosum_val_lambda)
lassosum_val_param<-substr(lassosum_val_param, 1, nchar(lassosum_val_param)-1)
selected_models<-rbind(selected_models,data.frame(Test='lassosum.PseudoVal', Label=res_eval_i[grepl(lassosum_val_param, res_eval_i$Model),]$Model[1]))
##
# PRScs
##
# Identify best pT in 10FCVal
res_eval_i_per_PRScs<-res_eval_i[grepl('PRScs.PredFile', res_eval_i$Model),]
res_eval_i_per_PRScs<-res_eval_i_per_PRScs[!grepl('phiauto', res_eval_i_per_PRScs$Model),]
selected_models<-rbind(selected_models,data.frame(Test='PRScs.10FCVal', Label=res_eval_i_per_PRScs[res_eval_i_per_PRScs$R == max(res_eval_i_per_PRScs$R),]$Model[1]))
# Identify multi pT
selected_models<-rbind(selected_models,data.frame(Test='PRScs.MultiPRS', Label='PRScs'))
# Identify pseudoval
res_eval_i_per_PRScs<-res_eval_i[grepl('PRScs.PredFile', res_eval_i$Model),]
res_eval_i_per_PRScs<-res_eval_i_per_PRScs[grepl('phiauto', res_eval_i_per_PRScs$Model),]
selected_models<-rbind(selected_models,data.frame(Test='PRScs.PseudoVal', Label=res_eval_i_per_PRScs[res_eval_i_per_PRScs$R == max(res_eval_i_per_PRScs$R),]$Model[1]))
##
# SBayesR
##
# Identify PseudoVal
selected_models<-rbind(selected_models,data.frame(Test='SBayesR.PseudoVal', Label='SBayesR'))
##
# LDPred2
##
# Identify best pT in 10FCVal
res_eval_i_per_LDPred2<-res_eval_i[grepl('LDPred2.PredFile', res_eval_i$Model),]
res_eval_i_per_LDPred2<-res_eval_i_per_LDPred2[!grepl('beta.inf|beta.auto', res_eval_i_per_LDPred2$Model),]
selected_models<-rbind(selected_models,data.frame(Test='LDPred2.10FCVal', Label=res_eval_i_per_LDPred2[res_eval_i_per_LDPred2$R == max(res_eval_i_per_LDPred2$R),]$Model[1]))
# Identify multi pT
selected_models<-rbind(selected_models,data.frame(Test='LDPred2.MultiPRS', Label='LDPred2'))
# Identify pseudoval
res_eval_i_per_LDPred2<-res_eval_i[grepl('LDPred2.PredFile', res_eval_i$Model),]
res_eval_i_per_LDPred2<-res_eval_i_per_LDPred2[grepl('beta.inf', res_eval_i_per_LDPred2$Model),]
selected_models<-rbind(selected_models,data.frame(Test='LDPred2.Inf', Label=res_eval_i_per_LDPred2$Model))
res_eval_i_per_LDPred2<-res_eval_i[grepl('LDPred2.PredFile', res_eval_i$Model),]
res_eval_i_per_LDPred2<-res_eval_i_per_LDPred2[grepl('beta.auto', res_eval_i_per_LDPred2$Model),]
selected_models<-rbind(selected_models,data.frame(Test='LDPred2.PseudoVal', Label=res_eval_i_per_LDPred2$Model))
##
# DBSLMM
##
# Identify PseudoVal
selected_models<-rbind(selected_models,data.frame(Test='DBSLMM.PseudoVal', Label='DBSLMM'))
##
# All methods
##
selected_models<-rbind(selected_models,data.frame(Test='All.MultiPRS', Label='All'))
###
# Subset selected models and format
###
# Eval
selected_models$Label_2<-paste0(gsub('_group','',selected_models$Label),'_group')
res_eval_i_select<-res_eval_i[(res_eval_i$Model %in% selected_models$Label_2),]
res_eval_i_select<-merge(res_eval_i_select, selected_models[c('Test','Label_2')], by.x='Model',by.y='Label_2')
res_eval_i_select$Phenotype<-pheno[i]
res_eval_i_select<-res_eval_i_select[c('Phenotype','Test',"R","SE",'P','R2o','N')]
res_eval_i_select$Population<-pop[k]
res_eval<-rbind(res_eval,res_eval_i_select)
# Comp
selected_models$Label_3<-gsub('_group','',selected_models$Label)
res_comp_i_select<-res_comp_i[(res_comp_i$Model_1 %in% selected_models$Label_3) & (res_comp_i$Model_2 %in% selected_models$Label_3),]
res_comp_i_select<-merge(res_comp_i_select, selected_models[c('Test','Label_3')], by.x='Model_1',by.y='Label_3')
names(res_comp_i_select)[names(res_comp_i_select) == 'Test']<-'Model_1_Test'
res_comp_i_select<-merge(res_comp_i_select, selected_models[c('Test','Label_3')], by.x='Model_2',by.y='Label_3')
names(res_comp_i_select)[names(res_comp_i_select) == 'Test']<-'Model_2_Test'
res_comp_i_select$Label<-NULL
res_comp_i_select$Phenotype<-pheno[i]
res_comp_i_select<-res_comp_i_select[c('Phenotype','Model_1_Test','Model_2_Test','Model_1_R','Model_2_R','R_diff','R_diff_pval')]
res_comp_i_select$Population<-pop[k]
res_comp<-rbind(res_comp,res_comp_i_select)
###
# Create plot showing performance of each method
###
library(ggplot2)
library(cowplot)
res_eval_i_select$Method<-gsub('\\..*','',res_eval_i_select$Test)
res_eval_i_select$Model<-gsub('.*\\.','',res_eval_i_select$Test)
res_eval_i_select$Method<-factor(res_eval_i_select$Method, levels=c('pT+clump','lassosum','PRScs','SBayesR','LDPred2','DBSLMM','All'))
res_eval_i_select$Model<-factor(res_eval_i_select$Model, levels=c('10FCVal','MultiPRS','PseudoVal','Inf'))
res_eval_plots[[j]] <- ggplot( res_eval_i_select, aes(x=Method, y=R, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(width = 0.7, preserve = "single")) +
labs(y="Correlation (SE)", x='', title=paste0(pheno[i],"(",pop[k],")")) +
coord_cartesian(ylim=c(min(res_eval_i_select$R[res_eval_i_select$Phenotype==pheno[i]])-0.02, max(res_eval_i_select$R[res_eval_i_select$Phenotype==pheno[i]])+0.025), clip="on") +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
background_grid(major = 'y', minor = 'y') +
theme(legend.position="top", legend.title = element_blank(), legend.box="vertical", legend.margin=margin()) +
guides(fill=guide_legend(nrow=2))
###
# Create plot showing performance of each method compared to pT+clump
###
res_eval_i_select$R_diff<-res_eval_i_select$R-res_eval_i_select$R[res_eval_i_select$Test == "pT+clump.10FCVal"]
res_eval_diff_plots[[j]] <- ggplot( res_eval_i_select, aes(x=Method, y=R_diff, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
geom_errorbar(aes(ymin=R_diff-SE, ymax=R_diff+SE), width=.2, position=position_dodge(width = 0.7, preserve = "single")) +
labs(y="Correlation (SE)", x='', title=paste0(pheno[i],"(",pop[k],")")) +
coord_cartesian(ylim=c(min(res_eval_i_select$R_diff[res_eval_i_select$Phenotype==pheno[i]])-0.02, max(res_eval_i_select$R_diff[res_eval_i_select$Phenotype==pheno[i]])+0.025), clip="on") +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
background_grid(major = 'y', minor = 'y') +
theme(legend.position="top", legend.title = element_blank(), legend.box="vertical", legend.margin=margin()) +
guides(fill=guide_legend(nrow=2))
###
# Create plot showing matrix of differences with significance
###
library(reshape2)
res_comp_i_select$R_diff[res_comp_i_select$Model_1_Test == res_comp_i_select$Model_2_Test]<-NA
res_comp_i_select$R_diff_catagory<-'NA'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff < -0.002]<-'-0.025 - -0.002'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff < -0.025]<-'-0.08 - -0.025'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff < -0.08]<-'< -0.08'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff > -0.002 & res_comp_i_select$R_diff < 0.002]<-'-0.002 - 0.002'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff > 0.002]<-'0.002 - 0.025'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff > 0.025]<-'0.025 - 0.08'
res_comp_i_select$R_diff_catagory[res_comp_i_select$R_diff > 0.08]<-'> 0.08'
res_comp_i_select$R_diff_catagory<-factor(res_comp_i_select$R_diff_catagory, level=rev(c('< -0.08','-0.08 - -0.025','-0.025 - -0.002','-0.002 - 0.002','0.002 - 0.025','0.025 - 0.08','> 0.08')))
res_comp_i_select$star<-' '
res_comp_i_select$star[res_comp_i_select$R_diff_pval < 0.05]<-'*'
res_comp_i_select$star[res_comp_i_select$R_diff_pval < 1e-3]<-'**'
res_comp_i_select$star[res_comp_i_select$R_diff_pval < 1e-6]<-'***'
library(RColorBrewer)
res_eval_diff_matrix[[j]]<-ggplot(data = res_comp_i_select, aes(Model_2_Test, Model_1_Test, fill=R_diff_catagory))+
geom_tile(color = "white")+
labs(y='Test', x='Comparison', title=paste0(pheno[i],"(",pop[k],")"), fill='R difference') +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1))+
coord_fixed() +
geom_text(data=res_comp_i_select, aes(Model_2_Test, Model_1_Test, label = star), color = "black", size = 4, angle = 0, vjust=0.8) +
scale_fill_brewer(breaks = levels(res_comp_i_select$R_diff_catagory), palette = "RdBu", drop=F, na.value='grey')
j<-j+1
}
}
write.csv(res_eval, paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_intern.csv'), row.names=F, quote=F)
write.csv(res_comp, paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_diff_intern.csv'), row.names=F, quote=F)
png(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_intern.png'), units='px', res=300, width=2450, height=1000*ceiling(length(res_eval_plots)/2))
print(plot_grid(plotlist=res_eval_plots, ncol = 2))
dev.off()
png(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_pTDiff_intern.png'), units='px', res=300, width=2450, height=1000*ceiling(length(res_eval_plots)/2))
print(plot_grid(plotlist=res_eval_diff_plots, ncol = 2))
dev.off()
#png(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_diff_intern.png'), units='px', res=300, width=2450, height=1750*length(res_eval_diff_matrix))
#print(plot_grid(plotlist=res_eval_diff_matrix, ncol = 1))
#dev.off()
###################
# Average results across phenotypes, creates tables and figures
###################
res_eval_all<-res_eval
res_comp_all<-res_comp
meta_res_eval<-NULL
meta_res_comp<-NULL
meta_res_eval_plots<-list()
meta_res_eval_diff_plots<-list()
meta_res_eval_diff_matrix<-list()
meta_res_eval_all<-NULL
for(k in 1:length(pop)){
res_eval<-res_eval_all[res_eval_all$Population == pop[k],]
res_comp<-res_comp_all[res_comp_all$Population == pop[k],]
res_eval$Method<-gsub('\\..*','',res_eval$Test)
res_eval$Model<-gsub('.*\\.','',res_eval$Test)
res_eval$Phenotype<-factor(res_eval$Phenotype, level=unique(res_eval$Phenotype))
res_eval$Method<-factor(res_eval$Method, level=c("pT+clump",'lassosum','PRScs','SBayesR','LDPred2','DBSLMM','All'))
res_eval$Model<-factor(res_eval$Model, level=c('10FCVal','MultiPRS','PseudoVal','Inf'))
res_eval<-res_eval[order(res_eval$Phenotype, res_eval$Method, res_eval$Model),]
library(data.table)
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Target_scoring.config')
for(pheno_i in 1:length(pheno)){
if(pheno_i==1){
pheno_i_dat<-fread(paste0('/users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/',pop[k],'/',pheno[pheno_i],'_resid.pheno'))
names(pheno_i_dat)[3]<-pheno[pheno_i]
pheno_dat<-pheno_i_dat
} else {
pheno_i_dat<-fread(paste0('/users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/Residuals/',pop[k],'/',pheno[pheno_i],'_resid.pheno'))
names(pheno_i_dat)[3]<-pheno[pheno_i]
pheno_dat<-merge(pheno_dat, pheno_i_dat, by=c('FID','IID'), all=T)
}
}
# Calculate corelation between all phenotypes
cors<-abs(cor(as.matrix(pheno_dat[,-1:-2]), use='p'))
cors<-cors[unique(res_eval$Phenotype,),unique(res_eval$Phenotype,)]
cors[is.na(cors)]<-0
# Run aggregate function
library(MAd)
meta_res_eval<-NULL
for(i in unique(res_eval$Test)){
res_eval_i<-res_eval[res_eval$Test == i,]
res_eval_i$Sample<-'Target'
cors_i<-cors[(dimnames(cors)[[1]] %in% res_eval_i$Phenotype),(dimnames(cors)[[1]] %in% res_eval_i$Phenotype)]
R_agg_res_eval_i<-agg(id=Sample, es=R, var=SE^2, cor=cors_i, method="BHHR", mod=NULL, data=res_eval_i)
meta_res_eval<-rbind(meta_res_eval, data.frame( Test=i,
R=R_agg_res_eval_i$es[1],
R_SE=sqrt(R_agg_res_eval_i$var[1])))
}
# Calculate difference between methods and p-value using aggregate
# Truncate difference p-values to stop 0 values
res_comp$R_diff_pval[res_comp$R_diff_pval == 0]<-1e-320
meta_res_diff<-NULL
for(i in unique(res_eval$Test)){
for(j in unique(res_eval$Test)){
meta_res_eval_i<-meta_res_eval[meta_res_eval$Test == i,]
meta_res_eval_j<-meta_res_eval[meta_res_eval$Test == j,]
res_comp_i_j<-res_comp[res_comp$Model_1_Test == i & res_comp$Model_2_Test == j,]
# Calculate diff SE based on p-value
for(m in 1:dim(res_comp_i_j)[1]){
if(res_comp_i_j$R_diff_pval[m] == 1){
res_comp_i_j$R_diff_pval[m]<-1-0.001
}
}
res_comp_i_j$R_diff_z<-qnorm(res_comp_i_j$R_diff_pval/2)
res_comp_i_j$R_diff_SE<-res_comp_i_j$R_diff/res_comp_i_j$R_diff_z
res_comp_i_j$Sample<-'A'
cors_i<-cors[(dimnames(cors)[[1]] %in% res_comp_i_j$Phenotype),(dimnames(cors)[[1]] %in% res_comp_i_j$Phenotype)]
R_agg_res_comp_i_j<-agg(id=Sample, es=R_diff, var=R_diff_SE^2, cor=cors_i, method="BHHR", mod=NULL, data=res_comp_i_j)
meta_res_diff<-rbind(meta_res_diff, data.frame(Test_ref=i,
Test_targ=j,
R_diff=R_agg_res_comp_i_j$es[1],
R_diff_SE=sqrt(R_agg_res_comp_i_j$var[1])))
}
}
meta_res_diff$R_diff_Z<-meta_res_diff$R_diff/meta_res_diff$R_diff_SE
meta_res_diff$R_diff_P<-2*pnorm(-abs(meta_res_diff$R_diff_Z))
meta_res_diff<-meta_res_diff[,c("Test_ref","Test_targ","R_diff","R_diff_Z","R_diff_P")]
meta_res_diff$R_diff_Z[meta_res_diff$R_diff == 0]<-0
meta_res_diff$R_diff_P[meta_res_diff$R_diff == 0]<-1
# Calculate percentage improvement
meta_res_diff$R_diff_perc<-NA
for(i in unique(meta_res_diff$Test_ref)){
meta_res_diff$R_diff_perc[meta_res_diff$Test_ref == i]<-meta_res_diff$R_diff[meta_res_diff$Test_ref == i]/meta_res_eval$R[meta_res_eval$Test == i]*100
}
write.csv(meta_res_eval, paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_',pop[k],'_meta.csv'), row.names=F, quote=F)
meta_res_diff<-meta_res_diff[,c("Test_ref","Test_targ","R_diff",'R_diff_perc',"R_diff_Z","R_diff_P")]
write.csv(meta_res_diff, paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_',pop[k],'_diff_meta.csv'), row.names=F, quote=F)
###
# Create plot showing performance of each method
###
# To avoid unequal groups, insert NA for missing Model/Method pairs
meta_res_eval$Method<-gsub('\\..*','',meta_res_eval$Test)
meta_res_eval$Model<-gsub('.*\\.','',meta_res_eval$Test)
meta_res_eval$Method<-factor(meta_res_eval$Method, levels=c("pT+clump",'lassosum','PRScs','SBayesR','LDPred2','DBSLMM','All'))
meta_res_eval$Model<-factor(meta_res_eval$Model, levels=c('10FCVal','MultiPRS','PseudoVal','Inf'))
for(i in unique(meta_res_eval$Method)){
for(j in unique(meta_res_eval$Model)){
if(dim(meta_res_eval[meta_res_eval$Method == i & meta_res_eval$Model == j,])[1] > 0){
next
} else {
dud<-data.frame(Test=NA,
R=NA,
R_SE=NA,
Method=i,
Model=j)
meta_res_eval<-rbind(meta_res_eval,dud)
}
}
}
res_eval<-res_eval[,c('Phenotype','Test','R','Method','Model')]
for(i in unique(res_eval$Method)){
for(j in unique(res_eval$Model)){
if(dim(res_eval[res_eval$Method == i & res_eval$Model == j,])[1] > 0){
next
} else {
dud<-data.frame(Phenotype=NA,
Test=paste0(i,'.',j),
R=NA,
Method=i,
Model=j)
res_eval<-rbind(res_eval,dud)
}
}
}
library(ggplot2)
library(cowplot)
meta_res_eval_plots[[pop[k]]] <- ggplot( meta_res_eval, aes(x=Method, y=R, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
geom_errorbar(aes(ymin=R-R_SE, ymax=R+R_SE), width=.2, position=position_dodge(width = 0.7, preserve = "single")) +
labs(y="Correlation (SE)", x='', title=pop[k]) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_vline(xintercept = 1.5, linetype="dotted") +
geom_vline(xintercept = 2.5, linetype="dotted") +
geom_vline(xintercept = 3.5, linetype="dotted") +
geom_vline(xintercept = 4.5, linetype="dotted") +
geom_vline(xintercept = 5.5, linetype="dotted") +
geom_vline(xintercept = 6.5, linetype="dotted") +
geom_vline(xintercept = 7.5, linetype="dotted") +
geom_vline(xintercept = 8.5, linetype="dotted") +
background_grid(major = 'y', minor = 'y')
meta_res_eval_plots[[pop[k]]]<-meta_res_eval_plots[[pop[k]]] + coord_cartesian(ylim=c(0,0.5), clip="on")
###
# Create plot showing performance of each method compared to pT+clump
###
# Calculate differences at the average and phenotype specific level
meta_res_eval$R_diff<-meta_res_eval$R-meta_res_eval$R[which(meta_res_eval$Test == "pT+clump.10FCVal")]
res_eval$R_diff<-NA
for(i in pheno){
res_eval$R_diff[which(res_eval$Phenotype == i)]<-res_eval$R[which(res_eval$Phenotype == i)]-res_eval$R[which(res_eval$Test == "pT+clump.10FCVal" & res_eval$Phenotype == i)]
}
meta_res_eval$Method<-factor(meta_res_eval$Method, levels=c("pT+clump",'lassosum','PRScs','SBayesR','LDPred2','DBSLMM','All'))
meta_res_eval$Model<-factor(meta_res_eval$Model, levels=c('10FCVal','MultiPRS','PseudoVal','Inf'))
meta_res_eval_diff_plots[[pop[k]]] <- ggplot( meta_res_eval, aes(x=Method, y=R_diff, fill=Model)) +
geom_point(data=res_eval, mapping=aes(x=Method, y=R_diff, colour=Model), position=position_jitterdodge(jitter.width = 0.4, dodge.width = 0.7), alpha=0.3) +
geom_errorbar(aes(ymin=R_diff-R_SE, ymax=R_diff+R_SE), width=0.9, position=position_dodge(width = 0.7)) +
geom_point(stat="identity", position=position_dodge(0.7), size=2, shape=23, colour='black') +
labs(y="Difference in\nCorrelations (SE)", x='', title=pop[k]) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_vline(xintercept = 1.5, linetype="dotted") +
geom_vline(xintercept = 2.5, linetype="dotted") +
geom_vline(xintercept = 3.5, linetype="dotted") +
geom_vline(xintercept = 4.5, linetype="dotted") +
geom_vline(xintercept = 5.5, linetype="dotted") +
geom_vline(xintercept = 6.5, linetype="dotted") +
geom_vline(xintercept = 7.5, linetype="dotted") +
geom_vline(xintercept = 8.5, linetype="dotted") +
background_grid(major = 'y', minor = 'y') +
ylim(c(-0.08,0.12))
###
# Create plot showing matrix of differences with significance
###
library(reshape2)
meta_res_diff$R_diff[meta_res_diff$Test_ref == meta_res_diff$Test_targ]<-NA
meta_res_diff$R_diff_catagory<-'NA'
meta_res_diff$R_diff_catagory[meta_res_diff$R_diff < -0.002]<-'-0.025 - -0.002'
meta_res_diff$R_diff_catagory[meta_res_diff$R_diff < -0.025]<-'-0.08 - -0.025'
meta_res_diff$R_diff_catagory[meta_res_diff$R_diff < -0.08]<-'< -0.08'
meta_res_diff$R_diff_catagory[meta_res_diff$R_diff > -0.002 & meta_res_diff$R_diff < 0.002]<-'-0.002 - 0.002'
meta_res_diff$R_diff_catagory[meta_res_diff$R_diff > 0.002]<-'0.002 - 0.025'
meta_res_diff$R_diff_catagory[meta_res_diff$R_diff > 0.025]<-'0.025 - 0.08'
meta_res_diff$R_diff_catagory[meta_res_diff$R_diff > 0.08]<-'> 0.08'
meta_res_diff$R_diff_catagory<-factor(meta_res_diff$R_diff_catagory, level=rev(c('< -0.08','-0.08 - -0.025','-0.025 - -0.002','-0.002 - 0.002','0.002 - 0.025','0.025 - 0.08','> 0.08')))
meta_res_diff$star<-' '
meta_res_diff$star[meta_res_diff$R_diff_P < 0.05]<-'*'
meta_res_diff$star[meta_res_diff$R_diff_P < 1e-3]<-'**'
meta_res_diff$star[meta_res_diff$R_diff_P < 1e-6]<-'***'
library(RColorBrewer)
meta_res_eval_diff_matrix[[pop[k]]]<-ggplot(data = meta_res_diff, aes(Test_targ, Test_ref, fill=R_diff_catagory))+
geom_tile(color = "white")+
labs(y='Test', x='Comparison', title=pop[k], fill='R difference') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1)) +
coord_fixed() +
geom_text(data=meta_res_diff, aes(Test_ref, Test_targ, label = star), color = "black", size = 4, angle = 0, vjust=0.8) +
scale_fill_brewer(breaks = levels(meta_res_diff$R_diff_catagory), palette = "RdBu", drop=F, na.value='grey')
meta_res_eval$Population<-pop[k]
meta_res_eval_all<-rbind(meta_res_eval_all, meta_res_eval)
}
png('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_meta.png', units='px', res=300, width=3200, height=3000)
plot_grid(plotlist=meta_res_eval_plots, ncol = 2)
dev.off()
png('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_pTDiff_meta.png', units='px', res=300, width=3200, height=3000)
plot_grid(plotlist=meta_res_eval_diff_plots, ncol = 2)
dev.off()
png('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_diff_meta.png', units='px', res=300, width=3750, height=6000)
plot_grid(plotlist=meta_res_eval_diff_matrix, ncol = 2)
dev.off()
###
# Plot portability
###
# Pairwise methods comparison
res_port_plots<-list()
for(k in 1:length(pop)){
###
# Estimate portability
###
if(pop[k] != 'EUR_test'){
meta_res_eval_EUR<-meta_res_eval_all[which(meta_res_eval_all$Population == 'EUR_test'),]
meta_res_eval_EUR<-meta_res_eval_EUR[complete.cases(meta_res_eval_EUR),]
meta_res_eval_k<-meta_res_eval_all[which(meta_res_eval_all$Population == pop[k]),]
meta_res_eval_k<-meta_res_eval_k[complete.cases(meta_res_eval_k),]
meta_res_eval_k<-meta_res_eval_k[match(meta_res_eval_EUR$Test, meta_res_eval_k$Test),]
meta_res_eval_k$R_port<-meta_res_eval_k$R/meta_res_eval_EUR$R
meta_res_eval_k$R2_port<-(meta_res_eval_k$R^2)/(meta_res_eval_EUR$R^2)
meta_res_eval_k$Method<-gsub('\\..*','',meta_res_eval_k$Test)
meta_res_eval_k$Model<-gsub('.*\\.','',meta_res_eval_k$Test)
meta_res_eval_k$Method<-factor(meta_res_eval_k$Method, levels=c('pT+clump','lassosum','PRScs','SBayesR','LDPred2','DBSLMM','All'))
meta_res_eval_k$Model<-factor(meta_res_eval_k$Model, levels=c('10FCVal','MultiPRS','PseudoVal','Inf'))
res_port_plots[[pop[k]]]<-ggplot( meta_res_eval_k, aes(x=Method, y=R2_port, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
labs(y="Portability (Pairwise)", x='', title=pop[k]) +
ylim(0,1.25) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
background_grid(major = 'y', minor = 'y') +
theme(legend.position="top", legend.title = element_blank(), legend.box="vertical", legend.margin=margin()) +
guides(fill=guide_legend(nrow=2))
}
}
png(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_intern_portability_R2.png'), units='px', res=300, width=2450, height=1000*ceiling(length(res_port_plots)/2))
plot_grid(plotlist=res_port_plots, ncol = 2)
dev.off()
# Compared to EUR pt+clump
res_port_plots<-list()
for(k in 1:length(pop)){
###
# Estimate portability
###
if(pop[k] != 'EUR_test'){
meta_res_eval_EUR<-meta_res_eval_all[which(meta_res_eval_all$Population == 'EUR_test' & meta_res_eval_all$Test == 'pT+clump.10FCVal'),]
meta_res_eval_k<-meta_res_eval_all[which(meta_res_eval_all$Population == pop[k]),]
meta_res_eval_k<-meta_res_eval_k[complete.cases(meta_res_eval_k),]
meta_res_eval_k$R_port<-meta_res_eval_k$R/meta_res_eval_EUR$R
meta_res_eval_k$R2_port<-(meta_res_eval_k$R^2)/(meta_res_eval_EUR$R^2)
meta_res_eval_k$Method<-gsub('\\..*','',meta_res_eval_k$Test)
meta_res_eval_k$Model<-gsub('.*\\.','',meta_res_eval_k$Test)
meta_res_eval_k$Method<-factor(meta_res_eval_k$Method, levels=c('pT+clump','lassosum','PRScs','SBayesR','LDPred2','DBSLMM','All'))
meta_res_eval_k$Model<-factor(meta_res_eval_k$Model, levels=c('10FCVal','MultiPRS','PseudoVal','Inf'))
res_port_plots[[pop[k]]]<-ggplot( meta_res_eval_k, aes(x=Method, y=R2_port, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(preserve = "single"), width = 0.7) +
labs(y="Portability (pT+Clump)", x='', title=pop[k]) +
ylim(0,1.25) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
background_grid(major = 'y', minor = 'y') +
theme(legend.position="top", legend.title = element_blank(), legend.box="vertical", legend.margin=margin()) +
guides(fill=guide_legend(nrow=2))
}
}
png(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/PRS_method_comp_intern_portability_R2_ptClump.png'), units='px', res=300, width=2450, height=1000*ceiling(length(res_port_plots)/2))
plot_grid(plotlist=res_port_plots, ncol = 2)
dev.off()
Show comparison results
Show code
pheno<-fread('/users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt', header=F)$V1
pheno<-pheno[1:5]
pop<-c('AFR','AMR','EAS','EUR_test','SAS')
library(data.table)
res<-list()
for(i in 1:length(pheno)){
res_pheno<-NULL
for(k in 1:length(pop)){
tmp<-fread(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.',pheno[i],'.',pop[k],'-GeRSs.',pop[k],'-PRSs.pt_clump.pred_eval.txt'))
tmp<-tmp[!grepl('PredFile', tmp$Model),]
tmp2<-data.frame( Population=pop[k],
Phenotype=pheno[i],
tmp)
res_pheno<-rbind(res_pheno, tmp2)
}
res_pheno$Model<-gsub('_group','',res_pheno$Model)
res_pheno$Model<-factor(res_pheno$Model, levels=unique(res_pheno$Model))
res[[pheno[i]]]<-res_pheno
}
library(ggplot2)
library(cowplot)
plot_list<-NULL
for(i in 1:length(pheno)){
plot_list[[pheno[i]]]<-ggplot(res[[pheno[i]]], aes(x=factor(Model), y=R, colour=Population)) +
geom_point(stat="identity", position=position_dodge( width=.30)) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(.30)) +
theme_half_open() +
theme(axis.text.x = element_text(angle = 55, vjust = 1, hjust=1), plot.title = element_text(hjust = 0.4, size=12)) +
background_grid(major = 'y', minor = 'y') +
labs(x=NULL,y="Correlation (SE)", title=pheno[i])
}
png('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/UKB.Diverse.GeRS_vs_pTclump_intern.png', units='px', res=300, width=2250, height=2500)
plot_grid(plotlist=plot_list, ncol = 2)
dev.off()
res_2<-list()
for(i in 1:length(pheno)){
res_2_pheno<-NULL
for(k in 1:length(pop)){
tmp<-fread(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_GeRSs/UKBB.w_hm3.AllTissue.',pheno[i],'.',pop[k],'-GeRSs.',pop[k],'-PRSs.pt_clump.pred_comp.txt'))
tmp<-tmp[tmp$Model_1 == 'All' & tmp$Model_2 == 'PRS',]
tmp2<-data.frame( Population=pop[k],
Phenotype=pheno[i],
tmp)
res_2_pheno<-rbind(res_2_pheno, tmp2)
}
res_2_pheno$Model_1<-'GeRS + PRS'
res_2_pheno$Model_2<-'PRS only'
res_2[[pheno[i]]]<-res_2_pheno
}
res_2_all<-do.call(rbind, res_2)
write.csv(res_2_all, '/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/UKB.Diverse.GeRS_vs_pTclump_intern.csv', row.names=F, quote=F)
Show PRS vs. GeRS results
Population | Phenotype | Model_1 | Model_2 | Model_1_R | Model_2_R | R_diff | R_diff_pval |
---|---|---|---|---|---|---|---|
AFR | Height | GeRS + PRS | PRS only | 0.233 | 0.230 | 0.002 | 1.95e-01 |
AMR | Height | GeRS + PRS | PRS only | 0.397 | 0.396 | 0.001 | 5.66e-01 |
EAS | Height | GeRS + PRS | PRS only | 0.339 | 0.336 | 0.003 | 2.56e-01 |
EUR_test | Height | GeRS + PRS | PRS only | 0.528 | 0.523 | 0.006 | 7.80e-18 |
SAS | Height | GeRS + PRS | PRS only | 0.394 | 0.391 | 0.002 | 8.98e-02 |
AFR | DBP | GeRS + PRS | PRS only | 0.081 | 0.080 | 0.002 | 8.11e-01 |
AMR | DBP | GeRS + PRS | PRS only | 0.126 | 0.140 | -0.014 | 2.33e-01 |
EAS | DBP | GeRS + PRS | PRS only | 0.186 | 0.183 | 0.003 | 6.11e-01 |
EUR_test | DBP | GeRS + PRS | PRS only | 0.220 | 0.211 | 0.009 | 1.22e-10 |
SAS | DBP | GeRS + PRS | PRS only | 0.158 | 0.153 | 0.006 | 8.10e-02 |
AFR | SBP | GeRS + PRS | PRS only | 0.095 | 0.085 | 0.010 | 1.11e-01 |
AMR | SBP | GeRS + PRS | PRS only | 0.165 | 0.158 | 0.008 | 4.59e-01 |
EAS | SBP | GeRS + PRS | PRS only | 0.183 | 0.179 | 0.005 | 5.97e-01 |
EUR_test | SBP | GeRS + PRS | PRS only | 0.221 | 0.209 | 0.012 | 2.11e-15 |
SAS | SBP | GeRS + PRS | PRS only | 0.177 | 0.173 | 0.005 | 1.12e-01 |
AFR | BMI | GeRS + PRS | PRS only | 0.150 | 0.150 | 0.000 | 9.07e-01 |
AMR | BMI | GeRS + PRS | PRS only | 0.209 | 0.213 | -0.003 | 3.70e-01 |
EAS | BMI | GeRS + PRS | PRS only | 0.182 | 0.187 | -0.004 | 1.08e-01 |
EUR_test | BMI | GeRS + PRS | PRS only | 0.327 | 0.320 | 0.007 | 9.26e-17 |
SAS | BMI | GeRS + PRS | PRS only | 0.244 | 0.243 | 0.001 | 6.33e-01 |
AFR | WBC | GeRS + PRS | PRS only | 0.118 | 0.111 | 0.006 | 2.70e-01 |
AMR | WBC | GeRS + PRS | PRS only | 0.215 | 0.219 | -0.004 | 5.72e-01 |
EAS | WBC | GeRS + PRS | PRS only | 0.196 | 0.202 | -0.007 | 1.18e-01 |
EUR_test | WBC | GeRS + PRS | PRS only | 0.303 | 0.291 | 0.013 | 2.76e-24 |
SAS | WBC | GeRS + PRS | PRS only | 0.249 | 0.244 | 0.005 | 1.33e-01 |
Plot comparison results
library(ggplot2)
library(cowplot)
pheno<-c('BMI','Height')
gwas<-c('BODY04','HEIG03')
pred_eval_pop<-NULL
pred_comp_pop<-NULL
for(pop in c('AFR','AMR','EAS','EUR','SAS')){
for(i in 1:length(gwas)){
pred_eval<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_PCs/UKBB.w_hm3.',gwas[i],'.',pop,'-PCs-PRSs.pt_clump.pred_eval.txt'), header=T, stringsAsFactors=F)
pred_eval$Phenotype<-pheno[i]
pred_eval$Population<-pop
pred_eval_pop<-rbind(pred_eval_pop, pred_eval)
pred_comp<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_PCs/UKBB.w_hm3.',gwas[i],'.',pop,'-PCs-PRSs.pt_clump.pred_comp.txt'), header=T, stringsAsFactors=F)
pred_comp$Phenotype<-pheno[i]
pred_comp$Population<-pop
pred_comp_pop<-rbind(pred_comp_pop, pred_comp)
}
}
pred_eval_pop$Model<-gsub('_group','', pred_eval_pop$Model)
pred_eval_pop$Model<-factor(pred_eval_pop$Model, levels = c('PCs',"PRS",'All'))
png('/scratch/users/k1806347/Analyses/DiverseAncestry/PC_vs_PRS.png', units='px', res=300, width=2000, height=1000)
ggplot(pred_eval_pop, aes(x=Phenotype, y=R, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(), width = 0.7) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(width = 0.7)) +
labs(y="Correlation (SE)", x='') +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_blank()) +
background_grid(major = 'y', minor = 'y') +
facet_grid(. ~ Population)
dev.off()
write.csv(pred_comp_pop, '/scratch/users/k1806347/Analyses/DiverseAncestry/PC_vs_PRS.csv', quote=F, row.names=F)
Plot comparison results
library(ggplot2)
library(cowplot)
pheno<-read.table('/users/k1806347/brc_scratch/Misc/Toshiki/Phenotypes/liang_pheno_names.txt')$V1
pred_eval_pop<-NULL
pred_comp_pop<-NULL
for(pop in c('AFR','AMR','EAS','EUR_test','SAS')){
for(i in 1:length(pheno)){
pred_eval<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_PCs/UKBB.w_hm3.',pheno[i],'.',pop,'-PCs-PRSs.pt_clump-residuals.pred_eval.txt'), header=T, stringsAsFactors=F)
pred_eval$Phenotype<-pheno[i]
pred_eval$Population<-pop
pred_eval_pop<-rbind(pred_eval_pop, pred_eval)
pred_comp<-read.table(paste0('/users/k1806347/brc_scratch/Analyses/DiverseAncestry/UKBB_outcomes_for_prediction/',pheno[i],'/Association_withPRS_and_PCs/UKBB.w_hm3.',pheno[i],'.',pop,'-PCs-PRSs.pt_clump-residuals.pred_comp.txt'), header=T, stringsAsFactors=F)
pred_comp$Phenotype<-pheno[i]
pred_comp$Population<-pop
pred_comp_pop<-rbind(pred_comp_pop, pred_comp)
}
}
pred_eval_pop$Model<-gsub('_group','', pred_eval_pop$Model)
pred_eval_pop$Model<-gsub('PRS.resid',"PRS (resid)", pred_eval_pop$Model)
pred_eval_pop$Model<-factor(pred_eval_pop$Model, levels = c('PCs',"PRS (resid)",'All'))
png('/scratch/users/k1806347/Analyses/DiverseAncestry/PC_vs_PRS_residuals_intern.png', units='px', res=300, width=3500, height=5000)
ggplot(pred_eval_pop, aes(x=Phenotype, y=R, fill=Model)) +
geom_bar(stat="identity", position=position_dodge(), width = 0.7) +
geom_errorbar(aes(ymin=R-SE, ymax=R+SE), width=.2, position=position_dodge(width = 0.7)) +
labs(y="Correlation (SE)", x='') +
theme_half_open() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_blank()) +
background_grid(major = 'y', minor = 'y') +
facet_grid(Population ~ .)
dev.off()
write.csv(pred_comp_pop, '/scratch/users/k1806347/Analyses/DiverseAncestry/PC_vs_PRS_residuals_intern.csv', quote=F, row.names=F)
The PRS methods comparison is useful and should be finished.
The GeRS + PRS method is showing far less encouraging results when using the internally derived GWAS sumstats. Seems GeRS are capturing something important in the HEIG and BODY GWAS that the standard PRS misses. Incontrast, the gains are minimal in UKB derived GWAS. Should this analysis be finished or abandonded.
To do: