To evaluate genotype-based prediction approaches, I am using a range of phenotypes measured in the UK Biobank and TEDS samples.

1 UKBB

Phenotype:

Depression
Intelligence
BMI
Height
CAD (Coronary artery disease)
T2D (Type II diabetes)
Inflammatory Bowel Disorder (IBD)
Rheumatoid arthritis (RheumArth)
Multiple Slerosis (MultiScler)
Prostate Cancer
Breast Cancer

Below I provide a description of how each phenotype was derived and some descriptive statistics.

1.1 Depression

The depression phenotype was shared with me by Kylie Glanville.

Preparation of Depression phenotype

######
# Modify depression phenotype file so IDs match genotypic data
######

module add apps/R/3.6.0
R

source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
pheno<-fread(paste0(UKBB_output, '/Phenotype/Depression/ever_depressed_pheno_final.txt'))
fam<-fread(paste0(UKBB_output,'/UKBB/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam')
names(fam)[1:2]<-c('FID','IID')

pheno_fam<-merge(pheno, fam, by=c('IID'))
pheno_new<-pheno_fam[,c('FID.y','IID','Depressed.Ever.Clean'), with=F]
names(pheno_new)[1]<-'FID'

fwrite(pheno_new, paste0(UKBB_output,'/Phenotype/Depression/ever_depressed_pheno_final.UpdateIDs.txt'), sep=' ', na='NA')

q()
n

1.2 Intelligence

The Intelligence phenotype is derived from the Fluid.intelligence.score.0.0 (f.20191.0.0) variable.

Preparation of Intelligence phenotype

#####
# Extract intelligence variable, update IDs and calculate descriptives
#####
module add apps/R/3.6.0
R

source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')

# Read in the field finder to find he code for the phenotype of interest.
field_finder<-read.table(paste0(ukbb_pheno_file,'/ukb18177_glanville_field_finder.txt'), stringsAsFactors=F)
field_finder[grepl('Fluid.intelligence', field_finder$V3),]
# The Fluid intelligence variable has a code of f.20191.0.0 and is called Fluid.intelligence.score.0.0

# Identify which column in the phenotype file contains the relevant code
library(data.table)
pheno_header<-names(data.frame(fread(cmd=paste0('head -n 1 ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt'))))
rel_cols<-c(1,which(grepl('f.20191.0.0',pheno_header)))

# Read in the IID and fluid intelligence columns
pheno<-data.frame(fread(cmd=paste0('cut -f ',paste(rel_cols,collapse=','),' ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt')))
names(pheno)<-c('IID','Fluid.intelligence.score')

# Remove individuals with missing phenotype data
pheno<-pheno[complete.cases(pheno),]

# Calculate descriptive statistics for the phenotype
library(e1071)
descriptives<-data.frame(Phenotype='Fluid.intelligence.score',
                        Code='f.20127.0.0',
                        Mean=mean(pheno$Fluid.intelligence.score),
                        SD=sd(pheno$Fluid.intelligence.score),
                        Min=min(pheno$Fluid.intelligence.score),
                        Max=max(pheno$Fluid.intelligence.score),
                        Skewness=skewness(pheno$Fluid.intelligence.score),
                        N=dim(pheno)[1])

system(paste0('mkdir ',UKBB_output,'/Phenotype/Intelligence'))
write.table(descriptives,paste0(UKBB_output,'/Phenotype/Intelligence/Descriptives.txt'), row.names=F, quote=F)

# Plot the distribution
png(paste0(UKBB_output,'/Phenotype/Intelligence/Histogram.png'))
hist(pheno$Fluid.intelligence.score)
dev.off()

# Write out the phenotype in PLINK format.
pheno_plink<-data.frame(FID=pheno$IID,
                        IID=pheno$IID,
                        Fluid.intelligence.score=pheno$Fluid.intelligence.score)
                        
fwrite(pheno_plink, paste0(UKBB_output,'/Phenotype/Intelligence/UKBB_Fluid.intelligence.score.pheno'), sep='\t')

q()
n

Descriptives of Intelligence phenotype

Figure 1: Distribution of Intelligence in UK Biobank

Descriptives of Intelligence in UK Biobank
Phenotype	Code	Mean	SD	Min	Max	Skewness	N
Fluid.intelligence.score	f.20127.0.0	5.434	2.039	0	13	0.155	123656

1.3 Height

The Height phenotype is derived from the Standing.height.0.0 (f.50.0.0) variable.

Preparation of Height phenotype

#####
# Extract Height variable, update IDs and calculate descriptives
#####
module add apps/R/3.6.0
R

source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')

# Read in the field finder to find he code for the phenotype of interest.
field_finder<-read.table(paste0(ukbb_pheno_file,'/ukb18177_glanville_field_finder.txt'), stringsAsFactors=F)
field_finder[grepl('height', field_finder$V3),]
# The best height variable has a code of f.50.0.0 and is called Standing.height.0.0

# Identify which column in the phenotype file contains the relevant code
library(data.table)
pheno_header<-names(data.frame(fread(cmd=paste0('head -n 1 ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt'))))
rel_cols<-c(1,which(grepl('f.50.0.0',pheno_header)))

# Read in the IID and fluid intelligence columns
pheno<-data.frame(fread(cmd=paste0('cut -f ',paste(rel_cols,collapse=','),' ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt')))
names(pheno)<-c('IID','Height')

# Remove individuals with missing phenotype data
pheno<-pheno[complete.cases(pheno),]

# Calculate descriptive statistics for the phenotype
library(e1071)
descriptives<-data.frame(Phenotype='Standing.height.0.0',
                        Code='f.50.0.0',
                        Mean=mean(pheno$Height),
                        SD=sd(pheno$Height),
                        Min=min(pheno$Height),
                        Max=max(pheno$Height),
                        Skewness=skewness(pheno$Height),
                        N=dim(pheno)[1])

system(paste0('mkdir ',UKBB_output,'/Phenotype/Height'))
write.table(descriptives,paste0(UKBB_output,'/Phenotype/Height/Descriptives.txt'), row.names=F, quote=F)

# Plot the distribution
png(paste0(UKBB_output,'/Phenotype/Height/Histogram.png'))
hist(pheno$Height)
dev.off()

# Remove outliers
pheno_no_outlier<-pheno[(pheno$Height < mean(pheno$Height)+(3*sd(pheno$Height))) & (pheno$Height > mean(pheno$Height)-(3*sd(pheno$Height))),]

# Plot the distribution
png(paste0(UKBB_output,'/Phenotype/Height/Histogram_noOutlier.png'))
hist(pheno_no_outlier$Height)
dev.off()

# Write out the phenotype in PLINK format.
pheno_plink<-data.frame(FID=pheno_no_outlier$IID,
                        IID=pheno_no_outlier$IID,
                        Height=pheno_no_outlier$Height)
                        
fwrite(pheno_plink, paste0(UKBB_output,'/Phenotype/Height/UKBB_Height.pheno'), sep='\t')

q()
n

Descriptives of Height phenotype

Distribution of Height in UK Biobank

Distribution of Height in UK Biobank after removal of outliers

Descriptives of Height in UK Biobank
Phenotype	Code	Mean	SD	Min	Max	Skewness	N
Standing.height.0.0	f.50.0.0	168.443	9.279	75	209	0.158	500004

1.4 BMI

The BMI phenotype is derived from the Body.mass.index.BMI.0.0 (f.21001.0.0) variable.

Preparation of BMI phenotype

#####
# Extract BMI variable, update IDs and calculate descriptives
#####

module add apps/R/3.6.0
R

source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')

# Read in the field finder to find he code for the phenotype of interest.
field_finder<-read.table(paste0(ukbb_pheno_file,'/ukb18177_glanville_field_finder.txt'), stringsAsFactors=F)
field_finder[grepl('bmi', field_finder$V3),]
# The best height variable has a code of f.21001.0.0 and is called Body.mass.index.BMI.0.0

# Identify which column in the phenotype file contains the relevant code
library(data.table)
pheno_header<-names(data.frame(fread(cmd=paste0('head -n 1 ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt'))))
rel_cols<-c(1,which(grepl('f.21001.0.0',pheno_header)))

# Read in the IID and fluid intelligence columns
pheno<-data.frame(fread(cmd=paste0('cut -f ',paste(rel_cols,collapse=','),' ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt')))
names(pheno)<-c('IID','BMI')

# Remove individuals with missing phenotype data
pheno<-pheno[complete.cases(pheno),]

# Calculate descriptive statistics for the phenotype
library(e1071)
descriptives<-data.frame(Phenotype='Body.mass.index.BMI.0.0',
                        Code='f.50.0.0',
                        Mean=mean(pheno$BMI),
                        SD=sd(pheno$BMI),
                        Min=min(pheno$BMI),
                        Max=max(pheno$BMI),
                        Skewness=skewness(pheno$BMI),
                        N=dim(pheno)[1])

system(paste0('mkdir ',UKBB_output,'/Phenotype/BMI'))
write.table(descriptives,paste0(UKBB_output,'/Phenotype/BMI/Descriptives.txt'), row.names=F, quote=F)

# Plot the distribution
png(paste0(UKBB_output,'/Phenotype/BMI/Histogram.png'))
hist(pheno$BMI)
dev.off()

# Remove outliers
pheno_no_outlier<-pheno[(pheno$BMI < mean(pheno$BMI)+(3*sd(pheno$BMI))) & (pheno$BMI > mean(pheno$BMI)-(3*sd(pheno$BMI))),]

# Plot the distribution
png(paste0(UKBB_output,'/Phenotype/BMI/Histogram_noOutlier.png'))
hist(pheno_no_outlier$BMI)
dev.off()

# Write out the phenotype in PLINK format.
pheno_plink<-data.frame(FID=pheno_no_outlier$IID,
                        IID=pheno_no_outlier$IID,
                        BMI=pheno_no_outlier$BMI)
                        
fwrite(pheno_plink, paste0(UKBB_output,'/Phenotype/BMI/UKBB_BMI.pheno'), sep='\t')

q()
n

Descriptives of BMI phenotype

Figure 4: Distribution of BMI in UK Biobank

Distribution of BMI in UK Biobank after removal of outliers

Descriptives of BMI in UK Biobank
Phenotype	Code	Mean	SD	Min	Max	Skewness	N
Body.mass.index.BMI.0.0	f.50.0.0	27.433	4.803	12.121	74.684	1.096	499438

1.5 CAD

T2D phenotypes were shared by Saskia Hagenaars and Anna Fürtjes.

Preparation of CAD phenotype

#####
# Modify CAD phenotype file so IDs match genotypic data
#####

module add apps/R/3.6.0
R

source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
pheno<-fread(paste0(UKBB_output, '/Phenotype/CAD/cad_only_111119.txt'))
names(pheno)[1]<-'IID'
fam<-fread(paste0(UKBB_output,'/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam'))
names(fam)[1:2]<-c('FID','IID')

pheno_fam<-merge(pheno, fam, by=c('IID'))
pheno_new<-pheno_fam[,c('FID','IID','CAD_all'), with=F]

fwrite(pheno_new, paste0(UKBB_output,'/Phenotype/CAD/cad_only_111119.UpdateIDs.txt'), sep=' ', na='NA')

q()
n

1.6 Type-2 Diabetes (T2D)

T2D phenotypes were shared by Saskia Hagenaars and Anna Fürtjes.

Preparation of T2D phenotype

#####
# Modify T2D phenotype file so IDs match genotypic data
#####

module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
pheno<-fread(paste0(UKBB_output, '/Phenotype/T2D/t2d_only_111119.txt'))
names(pheno)[1]<-'IID'
fam<-fread(paste0(UKBB_output,'/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam'))
names(fam)[1:2]<-c('FID','IID')

pheno_fam<-merge(pheno, fam, by=c('IID'))
pheno_new<-pheno_fam[,c('FID','IID','t2d_all'), with=F]

fwrite(pheno_new, paste0(UKBB_output,'/Phenotype/T2D/t2d_only_111119.UpdateIDs.txt'), sep=' ', na='NA')

q()
n

1.7 Autoimmune disorders

Phenotypes were shared by Kylie Glanville.

Preparation of Autoimmune disorder phenotype

#####
# Modify Autoimmune Disorder phenotype file so IDs match genotypic data
#####

module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)
pheno<-fread(paste0(UKBB_output, '/Phenotype/AutoImmune/new_autoimmune_all'))
pheno$FID<-NULL
fam<-fread(paste0(UKBB_output,'/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam'))
names(fam)[1:2]<-c('FID','IID')

pheno_fam<-merge(pheno, fam, by=c('IID'))

# Extract probable phnotypes only
pheno_new<-pheno_fam[,c('FID','IID',names(pheno_fam)[grepl('_prob', names(pheno_fam))]), with=F]

# Identify individuals that don't have any autoimmune disorders
pheno_new_control<-pheno_new[which(rowSums(pheno_new[,-1:-2]) == 0),]

# Create a pheno file for Crohns MultiScler RheuArth
pheno_name<-c('IBD','MultiScler','RheuArth')
pheno_col<-c('ibd_prob',"ms_prob","arthritis_prob")
for(i in 1:3){
  pheno_new_i_case<-pheno_new[pheno_new[[pheno_col[i]]] == 1,]
  pheno_clean_i<-rbind(pheno_new_i_case, pheno_new_control)
  pheno_clean_i<-pheno_clean_i[,c('FID','IID',pheno_col[i]), with=F]
  system(paste0('mkdir ',UKBB_output,'/Phenotype/',pheno_name[i]))
  fwrite(pheno_clean_i, paste0(UKBB_output,'/Phenotype/',pheno_name[i],'/UKBB.',pheno_name[i],'.txt'), sep=' ', na='NA', quote=F)

}

q()
n

1.8 Prostate and Breast cancer

Preparation of cancer phenotype

#####
# Extract Height variable, update IDs and calculate descriptives
#####
module add apps/R/3.6.0
R

source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')

# Read in the field finder to find the code for the phenotype of interest.
field_finder<-read.table(paste0(ukbb_pheno_file,'/ukb18177_glanville_field_finder.txt'), stringsAsFactors=F)
field_finder[grepl('Cancer|cancer', field_finder$V3),]
# The self reported cancer variable code is f.20001
# The sex variable code is f.31.0.0

# Identify which column in the phenotype file contains the relevant code
library(data.table)
pheno_header<-names(data.frame(fread(cmd=paste0('head -n 1 ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt'))))
rel_cols<-c(1,which(grepl('f.31.0.0|f.20001',pheno_header)))

# Read in the IID, sex, and cancer columns
pheno<-data.frame(fread(cmd=paste0('cut -f ',paste(rel_cols,collapse=','),' ',ukbb_pheno_file,'/ukb18177_glanville_phenotypes.txt')))
names(pheno)[1:2]<-c('IID','sex')
names(pheno)<-gsub('X..','',names(pheno))
names(pheno)<-gsub('\\.\\.','',names(pheno))

# Update IDs to match genetic data
fam<-fread(paste0(UKBB_output,'/Genotype/Harmonised/UKBB.w_hm3.QCd.AllSNP.chr22.fam'))
names(fam)[1:2]<-c('FID','IID')

pheno<-merge(fam[,c('FID','IID')],pheno, by=c('IID'))

# Identify individuals with prostate cancer ID '1044' in any column
prostate<-NULL
for(i in names(pheno)[-1:-3]){
  prostate<-c(prostate, which(pheno[[i]] == '1044'))
  prostate<-unique(prostate)
}

pheno$prostate_cancer<-NA
pheno$prostate_cancer[pheno$sex == 1]<-0
pheno$prostate_cancer[prostate]<-1

# Identify individuals with breast cancer ID '1002' in any column
breast<-NULL
for(i in names(pheno)[-1:-3]){
  breast<-c(breast, which(pheno[[i]] == '1002'))
  breast<-unique(breast)
}

pheno$breast_cancer<-NA
pheno$breast_cancer[pheno$sex == 0]<-0
pheno$breast_cancer[breast]<-1

# Write out the phenotype in PLINK format.
pheno_plink_prostate<-data.frame(FID=pheno$FID,
                        IID=pheno$IID,
                        prostate_cancer=pheno$prostate_cancer)
                        
pheno_plink_prostate<-pheno_plink_prostate[complete.cases(pheno_plink_prostate),]

pheno_plink_breast<-data.frame(FID=pheno$FID,
                        IID=pheno$IID,
                        breast_cancer=pheno$breast_cancer)

pheno_plink_breast<-pheno_plink_breast[complete.cases(pheno_plink_breast),]

dir.create(paste0(UKBB_output,'/Phenotype/Prostate_Cancer/'))
dir.create(paste0(UKBB_output,'/Phenotype/Breast_Cancer/'))

fwrite(pheno_plink_prostate, paste0(UKBB_output,'/Phenotype/Prostate_Cancer/UKBB_Prostate_Cancer.pheno'), sep='\t')
fwrite(pheno_plink_breast, paste0(UKBB_output,'/Phenotype/Breast_Cancer/UKBB_Breast_Cancer.pheno'), sep='\t')

q()
n

1.9 Create subset of UK Biobank individuals

The UK Biobank sample is very large making the computation of the various scores and derivation of the models computationally intensive. We do not need such large samples to compare PRS methods, and it would also be useful to have a non-overlapping subset of UKBiobank which can be used as a reference sample.

Subsetting phenotypic data

#####
# Idenitfy subset of UK biobank with phenotypic data and European ancestry
#####

module add apps/R/3.6.0
R
source('/users/k1806347/brc_scratch/Software/MyGit/GenoPred/config_used/Phenotype_prep.config')
library(data.table)

pheno<-c('Depression','Intelligence','BMI','Height','T2D','CAD','IBD','MultiScler','RheuArth','Breast_Cancer','Prostate_Cancer')
pheno_file<-c('ever_depressed_pheno_final.UpdateIDs.txt','UKBB_Fluid.intelligence.score.UpdateIDs.pheno','UKBB_BMI.score.UpdateIDs.pheno','UKBB_Height.score.UpdateIDs.pheno','t2d_only_111119.UpdateIDs.txt','cad_only_111119.UpdateIDs.txt','UKBB.IBD.txt','UKBB.MultiScler.txt','UKBB.RheuArth.txt','UKBB_Breast_Cancer.pheno','UKBB_Prostate_Cancer.pheno')

pheno_list<-list()
for(i in 1:length(pheno)){
  pheno_list[[pheno[i]]]<-fread(paste0(UKBB_output,'/Phenotype/',pheno[i],'/',pheno_file[i]))
}

pheno_all<-Reduce(function(...) merge(..., all=T, by=c('FID','IID')), pheno_list)

# We want to retain all individuals that are cases, and those that have complete data.
pheno_all_2<-pheno_all[complete.cases(pheno_all[,!grepl('cancer',names(pheno_all)),with=F]) | pheno_all$Depressed.Ever.Clean == 1 | pheno_all$t2d_all == 1 | pheno_all$CAD_all == 1 | pheno_all$ibd_prob == 1 | pheno_all$ms_prob == 1 | pheno_all$arthritis_prob == 1 | pheno_all$breast_cancer == 1 | pheno_all$prostate_cancer == 1,]

# Extract individuals with EUR ancestry
EUR<-fread(paste0(UKBB_output,'/Projected_PCs/AllAncestry/UKBB.w_hm3.AllAncestry.EUR.keep'))
pheno_all_2_EUR<-pheno_all_2[(pheno_all_2$IID %in% EUR$V2),]

# Extract individuals that survive QC
keep<-fread('/users/k1806347/brc_scratch/Analyses/PRS_comparison/UKBB_outcomes_for_prediction/ukb18177_glanville_post_qc_id_list.UpdateIDs.fam')
pheno_all_2_EUR<-pheno_all_2_EUR[(pheno_all_2_EUR$IID %in% keep$V2),]

# Calculate the number of individuals surviving genotype QC and ancestry check
sum(EUR$V2 %in% keep$V2) # 379437

# Remove individuals in the UKB reference
UKB_ref<-fread('/users/k1806347/brc_scratch/Data/UKBB/UKBB_ref/genotype/UKBB.noPheno.EUR.10K.chr22.fam')
pheno_all_2_EUR<-pheno_all_2_EUR[!(pheno_all_2_EUR$IID %in% UKB_ref$V2),]
# This seems circular as the phenotype file was used to select UKB individuals in the reference. This is due to the addition of breast cancer and prostate cancer phenotype. Using the original set of reference individuals losses <150 cases for each phenotype and saves recreating UKB reference files. 

# There are still >100k individuals. Write out files for each phenotype, retaining individuals with complete data, with a maximum of 50000 individuals for quantitative outcomes.
system(paste0('mkdir ',UKBB_output,'/Phenotype/PRS_comp_subset'))
pheno_N<-NULL
pheno_any<-NULL
for(i in 1:length(pheno)){
  pheno_all_2_EUR_i<-pheno_all_2_EUR[,c('FID','IID',names(pheno_all_2_EUR)[2+i]),with=F]
  pheno_all_2_EUR_i<-pheno_all_2_EUR_i[complete.cases(pheno_all_2_EUR_i),]
  if(sum(pheno_all_2_EUR_i[[3]] == 0 | pheno_all_2_EUR_i[[3]] == 1) == length(pheno_all_2_EUR_i[[3]])){
    pheno_all_2_EUR_i_case<-pheno_all_2_EUR_i[pheno_all_2_EUR_i[[3]] == 1,]
    pheno_all_2_EUR_i_case<-pheno_all_2_EUR_i_case[1:25000,]
    pheno_all_2_EUR_i_case<-pheno_all_2_EUR_i_case[complete.cases(pheno_all_2_EUR_i_case),]
    pheno_all_2_EUR_i_con<-pheno_all_2_EUR_i[pheno_all_2_EUR_i[[3]] == 0,]
    pheno_all_2_EUR_i_con<-pheno_all_2_EUR_i_con[1:(50000 - dim(pheno_all_2_EUR_i_case)[1]),]
    pheno_all_2_EUR_i<-rbind(pheno_all_2_EUR_i_case, pheno_all_2_EUR_i_con)
    
    pheno_N<-rbind(pheno_N,data.frame(Pheno=pheno[i], 
                                      N=length(pheno_all_2_EUR_i[[3]]),
                                      N_case=sum(pheno_all_2_EUR_i[[3]] == 1,na.rm=T),
                                      N_con=sum(pheno_all_2_EUR_i[[3]] == 0,na.rm=T)))
  } else {
    pheno_all_2_EUR_i<-pheno_all_2_EUR_i[1:50000,]
    pheno_N<-rbind(pheno_N,data.frame(Pheno=pheno[i], 
                                      N=length(pheno_all_2_EUR_i[[3]]),
                                      N_case=NA,
                                      N_con=NA))
  }
  
  pheno_any<-rbind(pheno_any,pheno_all_2_EUR_i[,c('FID','IID'),with=F])
  pheno_any<-pheno_any[!duplicated(pheno_any),]
  
  write.table(pheno_all_2_EUR_i, paste0(UKBB_output,'/Phenotype/PRS_comp_subset/UKBB.',pheno[i],'.txt'), col.names=T, row.names=F, quote=F)
}

write.csv(pheno_N, paste0(UKBB_output,'/Phenotype/PRS_comp_subset/Pheno_N.csv'), row.names=F, quote=F)

# Calculate the number of individuals across all samples
dim(pheno_any) # 104278

Training Sample Phenotype Preparation