## @knitr models # this has two splits make_ADmixed_model_train_validate <- function(n, p_design, p_kinship, k, s, Fst, b0, beta_mean, eta, sigma2, geography = c("ind", "1d","circ"), percent_causal, percent_overlap) { # p_design: number of variables in X_design, i.e., the design matrix # p_kinship: number of variable in X_kinship, i.e., matrix used to calculate kinship # k: Number of intermediate subpopulations # s: The desired bias coefficient, which specifies σ indirectly. Required if sigma is missing # F: The length-k vector of inbreeding coefficients (or FST's) of the intermediate subpopulations, # up to a scaling factor (which cancels out in calculations). Required if sigma is missing # Fst: The desired final FST of the admixed individuals. Required if sigma is missing # browser() # define population structure # s <- 0.5 # desired bias coefficient # Fst <- 0.1 # desired FST for the admixed individuals geography <- match.arg(geography) new_model(name = "ggmix_dec_5_2019", label = sprintf("percent_causal = %s, percent_overlap = %s, eta = %s, sigma2 = %s, geography = %s, p_design = %s, p_kinship = %s, beta_mean = %s", percent_causal, percent_overlap, eta, sigma2, geography, p_design, p_kinship, beta_mean), params = list(n = n, p_design = p_design, p_kinship = p_kinship, k = k, s = s, Fst = Fst, b0 = b0, beta_mean = beta_mean, eta = eta, sigma2 = sigma2, geography = geography, percent_causal = percent_causal, percent_overlap = percent_overlap), simulate = function(n, p_design, p_kinship, k, s, Fst, b0, beta_mean, eta, sigma2, geography, percent_causal, percent_overlap, nsim) { models <- list() for (i in seq(nsim)) { ######################## ### Generate Kinship ### ######################## if (geography == "1d") { # browser() # k=10 FF <- 1:k # subpopulation FST vector, up to a scalar obj <- bnpsd::admix_prop_1d_linear(n_ind = n, k_subpops = k, bias_coeff = s, coanc_subpops = FF, fst = Fst)#, # sigma = 3) # Q <- obj$Q # FF <- obj$F # obj$admix_proportions admix_proportions <- obj$admix_proportions # rescaled inbreeding vector for intermediate subpopulations inbr_subpops <- obj$coanc_subpops # get pop structure parameters of the admixed individuals coancestry <- coanc_admix(admix_proportions, inbr_subpops) kinship <- coanc_to_kinship(coancestry) # popkin::plot_popkin(kinship) # # k=2 # obj <- bnpsd::admix_prop_1d_linear(n_ind = n, # k_subpops = k, # # bias_coeff = s, # # coanc_subpops = FF, # # fst = Fst, # sigma = 4) # # sigma = 3) # # Q <- obj$Q # # FF <- obj$F # # obj$admix_proportions # admix_proportions <- obj # # rescaled inbreeding vector for intermediate subpopulations # # inbr_subpops <- obj$coanc_subpops # coanc_subpops <- base::sample(c(0.1,0.3),size = 2, replace = FALSE) # # get pop structure parameters of the admixed individuals # coancestry <- coanc_admix(admix_proportions, coanc_subpops = coanc_subpops) # kinship <- coanc_to_kinship(coancestry) # popkin::plot_popkin(kinship) } else if (geography == "ind") { n1 <- n2 <- n3 <- n4 <- n5 <- 200*2 # here’s the labels (for simplicity, list all individuals of S1 first, then S2, then S3) labs <- c( rep.int("S1", n1), rep.int("S2", n2), rep.int("S3", n3), rep.int("S4", n4), rep.int("S5", n5)) # data dimensions infered from labs: length(labs) # number of individuals "n" # train # desired admixture matrix ("is" stands for "Independent Subpopulations") # Q <- bnpsd::admix_prop_indep_subpops(labs) # FF <- 1:k # subpopulation FST vector, unnormalized so far # FF <- FF/popkin::fst(FF)*Fst # normalized to have the desired Fst # number of subpopulations "k_subpops" k_subpops <- length(unique(labs)) # desired admixture matrix admix_proportions <- admix_prop_indep_subpops(labs) # subpopulation FST vector, unnormalized so far inbr_subpops <- 1 : k_subpops # normalized to have the desired FST # NOTE fst is a function in the `popkin` package inbr_subpops <- inbr_subpops / popkin::fst(inbr_subpops) * Fst # verify FST for the intermediate subpopulations # fst(inbr_subpops) #> [1] 0.2 # get coancestry of the admixed individuals coancestry <- coanc_admix(admix_proportions, inbr_subpops) # before getting FST for individuals, weigh then inversely proportional to subpop sizes weights <- popkin::weights_subpops(labs) # function from `popkin` package kinship <- coanc_to_kinship(coancestry) } else if (geography == "circ") { FF <- 1:k # subpopulation FST vector, up to a scalar # obj <- bnpsd::admix_prop_1d_circular(n_ind = n, k_subpops = k, s = s, F = FF, Fst = Fst) # Q <- obj$Q # FF <- obj$F # admixture proportions from *circular* 1D geography obj <- admix_prop_1d_circular( n_ind = n, k_subpops = k, bias_coeff = s, coanc_subpops = FF, fst = Fst ) admix_proportions <- obj$admix_proportions inbr_subpops <- obj$coanc_subpops # get pop structure parameters of the admixed individuals coancestry <- coanc_admix(admix_proportions, inbr_subpops) kinship <- coanc_to_kinship(coancestry) popkin::plot_popkin(kinship) } ##################### ### GENERATE SNPS ### ##################### ncausal <- p_design * percent_causal if (percent_overlap == "100") { total_snps_to_simulate <- p_design + p_kinship - ncausal # this contains all SNPs (X_{Design}:X_{kinship}) # out <- bnpsd::rbnpsd(Q = Q, F = FF, m = total_snps_to_simulate) # draw all random Allele Freqs (AFs) and genotypes # reuse the previous inbr_subpops, admix_proportions out <- bnpsd::draw_all_admix( admix_proportions = admix_proportions, inbr_subpops = inbr_subpops, m_loci = total_snps_to_simulate, # NOTE by default p_subpops and p_ind are not returned, but here we will ask for them want_p_subpops = TRUE, want_p_ind = TRUE ) Xall <- t(out$X) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n) Xall[1:5,1:5] dim(Xall) subpops <- ceiling( (1:n)/n*k ) table(subpops) # got k=10 subpops with 100 individuals each ################### ### CAUSAL LOCI ### ################### # browser() # Snps used for kinship snps_kinships <- sample(cnames, p_kinship, replace = FALSE) # all causal snps are in kinship matrix if (percent_causal != 0 ) { # browser() # compute marginal allele frequencies p_anc_hat <- colMeans(Xall[,snps_kinships,drop=FALSE], na.rm = TRUE)/2 # select random SNPs! this performs the magic... # also runs additional checks causal_indexes <- simtrait:::select_loci(maf = p_anc_hat, m_causal = ncausal, maf_cut = 0.05) # draw random SNP coefficients for selected loci # causal_coeffs <- stats::rnorm(ncausal, 0, 1) causal <- snps_kinships[causal_indexes] snps_design <- c(setdiff(cnames, snps_kinships), causal) not_causal <- setdiff(snps_design, causal) # subset data to consider causal loci only p_anc_hat <- p_anc_hat[causal_indexes] } else if (percent_causal == 0) { causal <- "" snps_design <- setdiff(cnames, snps_kinships) not_causal <- snps_design } Xkinship <- Xall[,snps_kinships] Xdesign <- Xall[,snps_design] # Xdesign_causal <- Xall[,causal_indexes,drop=F] # the subset of causal data (keep as a matrix even if m_causal == 1) # now estimate kinship using popkin PhiHat <- popkin::popkin(Xkinship, subpops = subpops, loci_on_cols = TRUE) mean_kinship <- mean(PhiHat) # PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE) } else if (percent_overlap == "0") { total_snps_to_simulate <- p_design + p_kinship # this contains all SNPs (X_{Testing}:X_{kinship}) # out <- bnpsd::rbnpsd(Q = Q, F = FF, m = total_snps_to_simulate) # draw all random Allele Freqs (AFs) and genotypes # reuse the previous inbr_subpops, admix_proportions out <- draw_all_admix( admix_proportions = admix_proportions, inbr_subpops = inbr_subpops, m_loci = total_snps_to_simulate, # NOTE by default p_subpops and p_ind are not returned, but here we will ask for them want_p_subpops = TRUE, want_p_ind = TRUE ) Xall <- t(out$X) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n) Xall[1:5,1:5] dim(Xall) subpops <- ceiling( (1:n)/n*k ) table(subpops) # got k=10 subpops with 100 individuals each # Snps used for kinship snps_kinships <- sample(cnames, p_kinship, replace = FALSE) length(snps_kinships) snps_design <- setdiff(cnames, snps_kinships) # length(snps_design) # setdiff(cnames, snps_kinships) %>% length() if (percent_causal !=0) { # compute marginal allele frequencies p_anc_hat <- colMeans(Xall[,snps_design, drop=FALSE], na.rm = TRUE)/2 # select random SNPs! this performs the magic... # also runs additional checks causal_indexes <- simtrait:::select_loci(maf = p_anc_hat, m_causal = ncausal, maf_cut = 0.05) # draw random SNP coefficients for selected loci # causal_coeffs <- stats::rnorm(ncausal, 0, 1) causal <- snps_design[causal_indexes] # causal <- sample(snps_design, ncausal, replace = FALSE) } else if (percent_causal == 0) { causal <- "" } not_causal <- setdiff(snps_design, causal) Xkinship <- Xall[,snps_kinships] Xdesign <- Xall[,snps_design] # now estimate kinship using popkin PhiHat <- popkin::popkin(Xkinship, subpops = subpops, loci_on_cols = TRUE) # PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE) } np <- dim(Xdesign) n <- np[[1]] p <- np[[2]] beta <- rep(0, length = p) if (percent_causal != 0) { # beta[which(colnames(Xdesign) %in% causal)] <- runif(n = length(causal), beta_mean - 0.3, beta_mean + 0.3) # beta[which(colnames(Xdesign) %in% causal)] <- c(2,4,3,3,1) beta[which(colnames(Xdesign) %in% causal)] <- rnorm(n = length(causal)) } ############# ### SCALE ### ############# # causal_coeffs <- beta[which(colnames(Xdesign) %in% causal)] # # # to scale causal_coeffs to give correct heritability, we need to estimate the pq = p(1-p) vector # # calculate pq = p_anc * (1 - p_anc) in one of two ways # # indirect, infer from genotypes and mean kinship # # recall E[ p_anc_hat * (1 - p_anc_hat) ] = pq * (1 - mean_kinship), so we solve for pq: # pq <- p_anc_hat * (1 - p_anc_hat) / (1 - mean_kinship) # # # the initial genetic variance is # sigma_0 <- sqrt( 2 * sum( pq * causal_coeffs^2 ) ) # # sigma_sq = sigma2 # herit = eta # # adjust causal_coeffss so final variance is sigma_sq*herit as desired! # # causal_coeffs <- causal_coeffs * sqrt( sigma_sq * herit ) / sigma_0 # scale by standard deviations # # G <- drop( Xdesign_causal %*% causal_coeffs ) # this is a vector # # NOTE by construction: # # Cov(G) = 2 * herit * kinship # # # works very well assuming causal_coeffs and p_anc are uncorrelated! # muXB <- 2 * sum( causal_coeffs ) * mean( p_anc_hat ) # # # in all cases: # # - remove the mean from the genotypes (muXB) # # - add the desired mean # mu = 0 # # G <- G - muXB + mu # # # draw noise # E <- stats::rnorm(n, 0, (1 - herit) * sigma_sq ) # noise has mean zero but variance ((1-herit) * sigma_sq) # # NOTE by construction: # # Cov(E) = (1-herit) * sigma_sq * I # # # lastly, here's the trait: # trait <- G + E mu <- as.numeric(Xdesign %*% beta) # y <- trait kin <- 2 * PhiHat tt <- eta * sigma2 * kin if (!all(eigen(tt)$values > 0)) { message("eta * sigma2 * kin not PD, using Matrix::nearPD") tt <- Matrix::nearPD(tt)$mat } P <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = tt) E <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) # y <- b0 + mu + t(P) + t(E) # y <- MASS::mvrnorm(1, mu = mu, Sigma = eta * sigma2 * kin + (1 - eta) * sigma2 * diag(n)) y <- b0 + mu + P + E # ind <- caret::createDataPartition(y, p = 0.5, list = FALSE)[,1] # partition the data into train/test/validate spec = c(train = .8, validate = .2) g = sample(cut( seq(nrow(Xdesign)), nrow(Xdesign)*cumsum(c(0,spec)), labels = names(spec) )) # g %>% table train_ind <- which(g == "train") validate_ind <- which(g == "validate") # test_ind <- which(g == "test") # res = split(admixed$x, g) xtrain <- Xdesign[train_ind,,drop=FALSE] # xtest <- Xdesign[test_ind,,drop=FALSE] xvalidate <- Xdesign[validate_ind,,drop=FALSE] ytrain <- y[train_ind] # ytest <- y[test_ind] yvalidate <- y[validate_ind] kin_train <- kin[train_ind,train_ind] # kin_test_train <- kin[test_ind,train_ind] kin_validate_train <- kin[validate_ind,train_ind] # xtrain <- Xdesign[ind,,drop=FALSE] # xtest <- Xdesign[-ind,,drop=FALSE] PC <- prcomp(xtrain) xtrain_lasso <- cbind(xtrain, PC$x[,1:10]) # xtest_pc <- predict(PC, newdata = xtest) # xtest_lasso <- cbind(xtest, xtest_pc[,1:10]) xvalidate_pc <- predict(PC, newdata = xvalidate) xvalidate_lasso <- cbind(xvalidate, xvalidate_pc[,1:10]) # kin_train <- kin[ind,ind] # kin_test_train <- kin[-ind,ind] # ytrain <- y[ind] # ytest <- y[-ind] mu_train <- mu[train_ind] models[[i]] <- list(ytrain = ytrain, # ytest = ytest, yvalidate = yvalidate, xtrain = xtrain, # xtest = xtest, xvalidate = xvalidate, xtrain_lasso = xtrain_lasso, # xtest_lasso = xtest_lasso, xvalidate_lasso = xvalidate_lasso, kin_train = kin_train, # kin_test_train = kin_test_train, # covaraince between train and test kin_validate_train = kin_validate_train, mu_train = mu_train, # Xbeta for training set causal = causal, beta = beta, not_causal = not_causal ) } return(models) }) } # this has three splits make_ADmixed_model_train_test_validate <- function(n, p_design, p_kinship, k, s, Fst, b0, beta_mean, eta, sigma2, geography = c("ind", "1d","circ"), percent_causal, percent_overlap) { # p_design: number of variables in X_design, i.e., the design matrix # p_kinship: number of variable in X_kinship, i.e., matrix used to calculate kinship # k: Number of intermediate subpopulations # s: The desired bias coefficient, which specifies σ indirectly. Required if sigma is missing # F: The length-k vector of inbreeding coefficients (or FST's) of the intermediate subpopulations, # up to a scaling factor (which cancels out in calculations). Required if sigma is missing # Fst: The desired final FST of the admixed individuals. Required if sigma is missing # browser() # define population structure # s <- 0.5 # desired bias coefficient # Fst <- 0.1 # desired FST for the admixed individuals geography <- match.arg(geography) new_model(name = "ggmix_07_08_2019", label = sprintf("percent_causal = %s, percent_overlap = %s, eta = %s, sigma2 = %s, geography = %s, p_design = %s, p_kinship = %s, beta_mean = %s", percent_causal, percent_overlap, eta, sigma2, geography, p_design, p_kinship, beta_mean), params = list(n = n, p_design = p_design, p_kinship = p_kinship, k = k, s = s, Fst = Fst, b0 = b0, beta_mean = beta_mean, eta = eta, sigma2 = sigma2, geography = geography, percent_causal = percent_causal, percent_overlap = percent_overlap), simulate = function(n, p_design, p_kinship, k, s, Fst, b0, beta_mean, eta, sigma2, geography, percent_causal, percent_overlap, nsim) { models <- list() for (i in seq(nsim)) { ######################## ### Generate Kinship ### ######################## if (geography == "1d") { # browser() # k=10 FF <- 1:k # subpopulation FST vector, up to a scalar obj <- bnpsd::admix_prop_1d_linear(n_ind = n, k_subpops = k, bias_coeff = s, coanc_subpops = FF, fst = Fst)#, # sigma = 3) # Q <- obj$Q # FF <- obj$F # obj$admix_proportions admix_proportions <- obj$admix_proportions # rescaled inbreeding vector for intermediate subpopulations inbr_subpops <- obj$coanc_subpops # get pop structure parameters of the admixed individuals coancestry <- coanc_admix(admix_proportions, inbr_subpops) kinship <- coanc_to_kinship(coancestry) # popkin::plot_popkin(kinship) # # k=2 # obj <- bnpsd::admix_prop_1d_linear(n_ind = n, # k_subpops = k, # # bias_coeff = s, # # coanc_subpops = FF, # # fst = Fst, # sigma = 4) # # sigma = 3) # # Q <- obj$Q # # FF <- obj$F # # obj$admix_proportions # admix_proportions <- obj # # rescaled inbreeding vector for intermediate subpopulations # # inbr_subpops <- obj$coanc_subpops # coanc_subpops <- base::sample(c(0.1,0.3),size = 2, replace = FALSE) # # get pop structure parameters of the admixed individuals # coancestry <- coanc_admix(admix_proportions, coanc_subpops = coanc_subpops) # kinship <- coanc_to_kinship(coancestry) # popkin::plot_popkin(kinship) } else if (geography == "ind") { n1 <- n2 <- n3 <- n4 <- n5 <- 200*2 # here’s the labels (for simplicity, list all individuals of S1 first, then S2, then S3) labs <- c( rep.int("S1", n1), rep.int("S2", n2), rep.int("S3", n3), rep.int("S4", n4), rep.int("S5", n5)) # data dimensions infered from labs: length(labs) # number of individuals "n" # train # desired admixture matrix ("is" stands for "Independent Subpopulations") # Q <- bnpsd::admix_prop_indep_subpops(labs) # FF <- 1:k # subpopulation FST vector, unnormalized so far # FF <- FF/popkin::fst(FF)*Fst # normalized to have the desired Fst # number of subpopulations "k_subpops" k_subpops <- length(unique(labs)) # desired admixture matrix admix_proportions <- admix_prop_indep_subpops(labs) # subpopulation FST vector, unnormalized so far inbr_subpops <- 1 : k_subpops # normalized to have the desired FST # NOTE fst is a function in the `popkin` package inbr_subpops <- inbr_subpops / popkin::fst(inbr_subpops) * Fst # verify FST for the intermediate subpopulations # fst(inbr_subpops) #> [1] 0.2 # get coancestry of the admixed individuals coancestry <- coanc_admix(admix_proportions, inbr_subpops) # before getting FST for individuals, weigh then inversely proportional to subpop sizes weights <- popkin::weights_subpops(labs) # function from `popkin` package kinship <- coanc_to_kinship(coancestry) } else if (geography == "circ") { FF <- 1:k # subpopulation FST vector, up to a scalar # obj <- bnpsd::admix_prop_1d_circular(n_ind = n, k_subpops = k, s = s, F = FF, Fst = Fst) # Q <- obj$Q # FF <- obj$F # admixture proportions from *circular* 1D geography obj <- admix_prop_1d_circular( n_ind = n, k_subpops = k, bias_coeff = s, coanc_subpops = FF, fst = Fst ) admix_proportions <- obj$admix_proportions inbr_subpops <- obj$coanc_subpops # get pop structure parameters of the admixed individuals coancestry <- coanc_admix(admix_proportions, inbr_subpops) kinship <- coanc_to_kinship(coancestry) popkin::plot_popkin(kinship) } ##################### ### GENERATE SNPS ### ##################### ncausal <- p_design * percent_causal if (percent_overlap == "100") { total_snps_to_simulate <- p_design + p_kinship - ncausal # this contains all SNPs (X_{Design}:X_{kinship}) # out <- bnpsd::rbnpsd(Q = Q, F = FF, m = total_snps_to_simulate) # draw all random Allele Freqs (AFs) and genotypes # reuse the previous inbr_subpops, admix_proportions out <- bnpsd::draw_all_admix( admix_proportions = admix_proportions, inbr_subpops = inbr_subpops, m_loci = total_snps_to_simulate, # NOTE by default p_subpops and p_ind are not returned, but here we will ask for them want_p_subpops = TRUE, want_p_ind = TRUE ) Xall <- t(out$X) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n) Xall[1:5,1:5] dim(Xall) subpops <- ceiling( (1:n)/n*k ) table(subpops) # got k=10 subpops with 100 individuals each ################### ### CAUSAL LOCI ### ################### # browser() # Snps used for kinship snps_kinships <- sample(cnames, p_kinship, replace = FALSE) # all causal snps are in kinship matrix if (percent_causal != 0 ) { # browser() # compute marginal allele frequencies p_anc_hat <- colMeans(Xall[,snps_kinships,drop=FALSE], na.rm = TRUE)/2 # select random SNPs! this performs the magic... # also runs additional checks causal_indexes <- simtrait:::select_loci(maf = p_anc_hat, m_causal = ncausal, maf_cut = 0.05) # draw random SNP coefficients for selected loci # causal_coeffs <- stats::rnorm(ncausal, 0, 1) causal <- snps_kinships[causal_indexes] snps_design <- c(setdiff(cnames, snps_kinships), causal) not_causal <- setdiff(snps_design, causal) # subset data to consider causal loci only p_anc_hat <- p_anc_hat[causal_indexes] } else if (percent_causal == 0) { causal <- "" snps_design <- setdiff(cnames, snps_kinships) not_causal <- snps_design } Xkinship <- Xall[,snps_kinships] Xdesign <- Xall[,snps_design] # Xdesign_causal <- Xall[,causal_indexes,drop=F] # the subset of causal data (keep as a matrix even if m_causal == 1) # now estimate kinship using popkin PhiHat <- popkin::popkin(Xkinship, subpops = subpops, loci_on_cols = TRUE) mean_kinship <- mean(PhiHat) # PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE) } else if (percent_overlap == "0") { total_snps_to_simulate <- p_design + p_kinship # this contains all SNPs (X_{Testing}:X_{kinship}) # out <- bnpsd::rbnpsd(Q = Q, F = FF, m = total_snps_to_simulate) # draw all random Allele Freqs (AFs) and genotypes # reuse the previous inbr_subpops, admix_proportions out <- draw_all_admix( admix_proportions = admix_proportions, inbr_subpops = inbr_subpops, m_loci = total_snps_to_simulate, # NOTE by default p_subpops and p_ind are not returned, but here we will ask for them want_p_subpops = TRUE, want_p_ind = TRUE ) Xall <- t(out$X) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n) Xall[1:5,1:5] dim(Xall) subpops <- ceiling( (1:n)/n*k ) table(subpops) # got k=10 subpops with 100 individuals each # Snps used for kinship snps_kinships <- sample(cnames, p_kinship, replace = FALSE) length(snps_kinships) snps_design <- setdiff(cnames, snps_kinships) # length(snps_design) # setdiff(cnames, snps_kinships) %>% length() if (percent_causal !=0) { # compute marginal allele frequencies p_anc_hat <- colMeans(Xall[,snps_design, drop=FALSE], na.rm = TRUE)/2 # select random SNPs! this performs the magic... # also runs additional checks causal_indexes <- simtrait:::select_loci(maf = p_anc_hat, m_causal = ncausal, maf_cut = 0.05) # draw random SNP coefficients for selected loci # causal_coeffs <- stats::rnorm(ncausal, 0, 1) causal <- snps_design[causal_indexes] # causal <- sample(snps_design, ncausal, replace = FALSE) } else if (percent_causal == 0) { causal <- "" } not_causal <- setdiff(snps_design, causal) Xkinship <- Xall[,snps_kinships] Xdesign <- Xall[,snps_design] # now estimate kinship using popkin PhiHat <- popkin::popkin(Xkinship, subpops = subpops, loci_on_cols = TRUE) # PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE) } np <- dim(Xdesign) n <- np[[1]] p <- np[[2]] beta <- rep(0, length = p) if (percent_causal != 0) { # beta[which(colnames(Xdesign) %in% causal)] <- runif(n = length(causal), beta_mean - 0.3, beta_mean + 0.3) # beta[which(colnames(Xdesign) %in% causal)] <- c(2,4,3,3,1) beta[which(colnames(Xdesign) %in% causal)] <- rnorm(n = length(causal)) } ############# ### SCALE ### ############# # causal_coeffs <- beta[which(colnames(Xdesign) %in% causal)] # # # to scale causal_coeffs to give correct heritability, we need to estimate the pq = p(1-p) vector # # calculate pq = p_anc * (1 - p_anc) in one of two ways # # indirect, infer from genotypes and mean kinship # # recall E[ p_anc_hat * (1 - p_anc_hat) ] = pq * (1 - mean_kinship), so we solve for pq: # pq <- p_anc_hat * (1 - p_anc_hat) / (1 - mean_kinship) # # # the initial genetic variance is # sigma_0 <- sqrt( 2 * sum( pq * causal_coeffs^2 ) ) # # sigma_sq = sigma2 # herit = eta # # adjust causal_coeffss so final variance is sigma_sq*herit as desired! # # causal_coeffs <- causal_coeffs * sqrt( sigma_sq * herit ) / sigma_0 # scale by standard deviations # # G <- drop( Xdesign_causal %*% causal_coeffs ) # this is a vector # # NOTE by construction: # # Cov(G) = 2 * herit * kinship # # # works very well assuming causal_coeffs and p_anc are uncorrelated! # muXB <- 2 * sum( causal_coeffs ) * mean( p_anc_hat ) # # # in all cases: # # - remove the mean from the genotypes (muXB) # # - add the desired mean # mu = 0 # # G <- G - muXB + mu # # # draw noise # E <- stats::rnorm(n, 0, (1 - herit) * sigma_sq ) # noise has mean zero but variance ((1-herit) * sigma_sq) # # NOTE by construction: # # Cov(E) = (1-herit) * sigma_sq * I # # # lastly, here's the trait: # trait <- G + E mu <- as.numeric(Xdesign %*% beta) # y <- trait kin <- 2 * PhiHat tt <- eta * sigma2 * kin if (!all(eigen(tt)$values > 0)) { message("eta * sigma2 * kin not PD, using Matrix::nearPD") tt <- Matrix::nearPD(tt)$mat } P <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = tt) E <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) # y <- b0 + mu + t(P) + t(E) # y <- MASS::mvrnorm(1, mu = mu, Sigma = eta * sigma2 * kin + (1 - eta) * sigma2 * diag(n)) y <- b0 + mu + P + E # ind <- caret::createDataPartition(y, p = 0.5, list = FALSE)[,1] # partition the data into train/test/validate spec = c(train = .6, test = .2, validate = .2) g = sample(cut( seq(nrow(Xdesign)), nrow(Xdesign)*cumsum(c(0,spec)), labels = names(spec) )) # g %>% table train_ind <- which(g == "train") validate_ind <- which(g == "validate") test_ind <- which(g == "test") # res = split(admixed$x, g) xtrain <- Xdesign[train_ind,,drop=FALSE] xtest <- Xdesign[test_ind,,drop=FALSE] xvalidate <- Xdesign[validate_ind,,drop=FALSE] ytrain <- y[train_ind] ytest <- y[test_ind] yvalidate <- y[validate_ind] kin_train <- kin[train_ind,train_ind] kin_test_train <- kin[test_ind,train_ind] kin_validate_train <- kin[validate_ind,train_ind] # xtrain <- Xdesign[ind,,drop=FALSE] # xtest <- Xdesign[-ind,,drop=FALSE] PC <- prcomp(xtrain) xtrain_lasso <- cbind(xtrain, PC$x[,1:10]) xtest_pc <- predict(PC, newdata = xtest) xtest_lasso <- cbind(xtest, xtest_pc[,1:10]) xvalidate_pc <- predict(PC, newdata = xvalidate) xvalidate_lasso <- cbind(xvalidate, xvalidate_pc[,1:10]) # kin_train <- kin[ind,ind] # kin_test_train <- kin[-ind,ind] # ytrain <- y[ind] # ytest <- y[-ind] mu_train <- mu[train_ind] models[[i]] <- list(ytrain = ytrain, ytest = ytest, yvalidate = yvalidate, xtrain = xtrain, xtest = xtest, xvalidate = xvalidate, xtrain_lasso = xtrain_lasso, xtest_lasso = xtest_lasso, xvalidate_lasso = xvalidate_lasso, kin_train = kin_train, kin_test_train = kin_test_train, # covaraince between train and test kin_validate_train = kin_validate_train, mu_train = mu_train, # Xbeta for training set causal = causal, beta = beta, not_causal = not_causal ) } return(models) }) } make_ADmixed_model_simtrait <- function(n, p_design, p_kinship, k, s, Fst, b0, beta_mean, eta, sigma2, geography = c("ind", "1d","circ"), percent_causal, percent_overlap) { # p_design: number of variables in X_design, i.e., the design matrix # p_kinship: number of variable in X_kinship, i.e., matrix used to calculate kinship # k: Number of intermediate subpopulations # s: The desired bias coefficient, which specifies σ indirectly. Required if sigma is missing # F: The length-k vector of inbreeding coefficients (or FST's) of the intermediate subpopulations, # up to a scaling factor (which cancels out in calculations). Required if sigma is missing # Fst: The desired final FST of the admixed individuals. Required if sigma is missing # browser() # define population structure # s <- 0.5 # desired bias coefficient # Fst <- 0.1 # desired FST for the admixed individuals geography <- match.arg(geography) new_model(name = "ggmix_05_07_2019", label = sprintf("percent_causal = %s, percent_overlap = %s, eta = %s, sigma2 = %s, geography = %s, p_design = %s, p_kinship = %s, beta_mean = %s", percent_causal, percent_overlap, eta, sigma2, geography, p_design, p_kinship, beta_mean), params = list(n = n, p_design = p_design, p_kinship = p_kinship, k = k, s = s, Fst = Fst, b0 = b0, beta_mean = beta_mean, eta = eta, sigma2 = sigma2, geography = geography, percent_causal = percent_causal, percent_overlap = percent_overlap), simulate = function(n, p_design, p_kinship, k, s, Fst, b0, beta_mean, eta, sigma2, geography, percent_causal, percent_overlap, nsim) { models <- list() for (i in seq(nsim)) { if (geography == "1d") { FF <- 1:k # subpopulation FST vector, up to a scalar obj <- bnpsd::admix_prop_1d_linear(n_ind = n, k_subpops = k, bias_coeff = s, coanc_subpops = FF, fst = Fst) # Q <- obj$Q # FF <- obj$F admix_proportions <- obj$admix_proportions # rescaled inbreeding vector for intermediate subpopulations inbr_subpops <- obj$coanc_subpops # get pop structure parameters of the admixed individuals coancestry <- coanc_admix(admix_proportions, inbr_subpops) kinship <- coanc_to_kinship(coancestry) } else if (geography == "ind") { n1 <- n2 <- n3 <- n4 <- n5 <- 200*2 # here’s the labels (for simplicity, list all individuals of S1 first, then S2, then S3) labs <- c( rep.int("S1", n1), rep.int("S2", n2), rep.int("S3", n3), rep.int("S4", n4), rep.int("S5", n5)) # data dimensions infered from labs: length(labs) # number of individuals "n" # train # desired admixture matrix ("is" stands for "Independent Subpopulations") # Q <- bnpsd::admix_prop_indep_subpops(labs) # FF <- 1:k # subpopulation FST vector, unnormalized so far # FF <- FF/popkin::fst(FF)*Fst # normalized to have the desired Fst # number of subpopulations "k_subpops" k_subpops <- length(unique(labs)) # desired admixture matrix admix_proportions <- admix_prop_indep_subpops(labs) # subpopulation FST vector, unnormalized so far inbr_subpops <- 1 : k_subpops # normalized to have the desired FST # NOTE fst is a function in the `popkin` package inbr_subpops <- inbr_subpops / popkin::fst(inbr_subpops) * Fst # verify FST for the intermediate subpopulations # fst(inbr_subpops) #> [1] 0.2 # get coancestry of the admixed individuals coancestry <- coanc_admix(admix_proportions, inbr_subpops) # before getting FST for individuals, weigh then inversely proportional to subpop sizes weights <- popkin::weights_subpops(labs) # function from `popkin` package kinship <- coanc_to_kinship(coancestry) } else if (geography == "circ") { FF <- 1:k # subpopulation FST vector, up to a scalar # obj <- bnpsd::admix_prop_1d_circular(n_ind = n, k_subpops = k, s = s, F = FF, Fst = Fst) # Q <- obj$Q # FF <- obj$F # admixture proportions from *circular* 1D geography obj <- admix_prop_1d_circular( n_ind = n, k_subpops = k, bias_coeff = s, coanc_subpops = FF, fst = Fst ) admix_proportions <- obj$admix_proportions inbr_subpops <- obj$coanc_subpops # get pop structure parameters of the admixed individuals coancestry <- coanc_admix(admix_proportions, inbr_subpops) kinship <- coanc_to_kinship(coancestry) } total_snps_to_simulate <- p_design # draw all random Allele Freqs (AFs) and genotypes # reuse the previous inbr_subpops, admix_proportions out <- bnpsd::draw_all_admix( admix_proportions = admix_proportions, inbr_subpops = inbr_subpops, m_loci = total_snps_to_simulate, # NOTE by default p_subpops and p_ind are not returned, but here we will ask for them want_p_subpops = TRUE, want_p_ind = TRUE ) Xall <- t(out$X) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n) subpops <- ceiling( (1:n)/n*k ) # ancestral allele frequencies p_anc <- out$p_anc np <- dim(Xall) n <- np[[1]] p <- np[[2]] # parameters of simulation m_causal <- p_design * percent_causal herit <- eta # create simulated trait and associated data # version 1: known p_anc (prefered, only applicable to simulated data) obj <- sim_trait(X = out$X, m_causal = m_causal, herit = herit, p_anc = p_anc) # version 2: known kinship (more broadly applicable but fewer guarantees) # obj <- sim_trait(X = X, m_causal = m_causal, herit = herit, kinship = kinship) # outputs in both versions: # trait vector y <- obj$y # randomly-picked causal locus index causal <- cnames[obj$i] not_causal <- setdiff(cnames, causal) # locus effect size vector beta <- obj$beta # theoretical covariance of the simulated traits # kin <- cov_trait(kinship = kinship, herit = herit) kin <- kinship # partition the data into train/test/validate spec = c(train = .6, test = .2, validate = .2) g = sample(cut( seq(nrow(Xall)), nrow(Xall)*cumsum(c(0,spec)), labels = names(spec) )) # g %>% table Xdesign <- Xall train_ind <- which(g == "train") validate_ind <- which(g == "validate") test_ind <- which(g == "test") # res = split(admixed$x, g) xtrain <- Xdesign[train_ind,,drop=FALSE] xtest <- Xdesign[test_ind,,drop=FALSE] xvalidate <- Xdesign[validate_ind,,drop=FALSE] ytrain <- y[train_ind] ytest <- y[test_ind] yvalidate <- y[validate_ind] kin_train <- kin[train_ind,train_ind] kin_test_train <- kin[test_ind,train_ind] kin_validate_train <- kin[validate_ind,train_ind] # xtrain <- Xdesign[ind,,drop=FALSE] # xtest <- Xdesign[-ind,,drop=FALSE] PC <- prcomp(xtrain) xtrain_lasso <- cbind(xtrain, PC$x[,1:10]) xtest_pc <- predict(PC, newdata = xtest) xtest_lasso <- cbind(xtest, xtest_pc[,1:10]) xvalidate_pc <- predict(PC, newdata = xvalidate) xvalidate_lasso <- cbind(xvalidate, xvalidate_pc[,1:10]) # kin_train <- kin[ind,ind] # kin_test_train <- kin[-ind,ind] # ytrain <- y[ind] # ytest <- y[-ind] mu_train <- (xtrain[,obj$i,drop = FALSE] %*% beta)[train_ind] models[[i]] <- list(ytrain = ytrain, ytest = ytest, yvalidate = yvalidate, xtrain = xtrain, xtest = xtest, xvalidate = xvalidate, xtrain_lasso = xtrain_lasso, xtest_lasso = xtest_lasso, xvalidate_lasso = xvalidate_lasso, kin_train = kin_train, kin_test_train = kin_test_train, # covaraince between train and test kin_validate_train = kin_validate_train, mu_train = mu_train, # Xbeta for training set causal = causal, beta = beta, not_causal = not_causal ) } return(models) }) } make_ADmixed_model_not_sim <- function(n, p_test, p_kinship, k, s, Fst, b0, beta_mean, eta, sigma2, geography = c("ind", "1d","circ"), percent_causal, percent_overlap) { # p_test: number of variables in X_test, i.e., the design matrix # p_kinship: number of variable in X_kinship, i.e., matrix used to calculate kinship # k: Number of intermediate subpopulations # s: The desired bias coefficient, which specifies σ indirectly. Required if sigma is missing # F: The length-k vector of inbreeding coefficients (or FST's) of the intermediate subpopulations, # up to a scaling factor (which cancels out in calculations). Required if sigma is missing # Fst: The desired final FST of the admixed individuals. Required if sigma is missing # browser() # define population structure FF <- 1:k # subpopulation FST vector, up to a scalar # s <- 0.5 # desired bias coefficient # Fst <- 0.1 # desired FST for the admixed individuals geography <- match.arg(geography) if (geography == "1d") { obj <- bnpsd::q1d(n = n, k = k, s = s, F = FF, Fst = Fst) Q <- obj$Q FF <- obj$F } else if (geography == "ind") { n_train_k <- n / k n1 <- n_train_k; n2 <- n_train_k; n3 <- n_train_k; n4 <- n_train_k; n5 <- n_train_k # here’s the labels (for simplicity, list all individuals of S1 first, then S2, then S3) labs <- c( rep.int("S1", n1), rep.int("S2", n2), rep.int("S3", n3), rep.int("S4", n4), rep.int("S5", n5)) # data dimensions infered from labs: length(labs) # number of individuals "n" # desired admixture matrix ("is" stands for "Independent Subpopulations") Q <- bnpsd::qis(labs) FF <- 1:k # subpopulation FST vector, unnormalized so far FF <- FF/popkin::fst(FF)*Fst # normalized to have the desired Fst } else if (geography == "circ") { obj <- bnpsd::q1dc(n = n, k = k, s = s, F = FF, Fst = Fst) Q <- obj$Q FF <- obj$F } ncausal <- p_test * percent_causal # browser() if (percent_overlap == "100") { total_snps_to_simulate <- p_test + p_kinship - ncausal # this contains all SNPs (X_{Testing}:X_{kinship}) out <- bnpsd::rbnpsd(Q, FF, total_snps_to_simulate) Xall <- t(out$X) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n) Xall[1:5,1:5] dim(Xall) subpops <- ceiling( (1:n)/n*k ) table(subpops) # got k=10 subpops with 100 individuals each # Snps used for kinship snps_kinships <- sample(cnames, p_kinship, replace = FALSE) length(snps_kinships) # all causal snps are in kinship matrix if (percent_causal != 0 ) { causal <- sample(snps_kinships, ncausal, replace = FALSE) snps_design <- c(setdiff(cnames, snps_kinships), causal) not_causal <- setdiff(snps_design, causal) } else if (percent_causal == 0) { causal <- "" snps_design <- setdiff(cnames, snps_kinships) not_causal <- snps_design } # length(snps_design) # setdiff(cnames, snps_kinships) %>% length() # browser() Xkinship <- Xall[,snps_kinships] Xtest <- Xall[,snps_design] # now estimate kinship using popkin # PhiHat <- popkin::popkin(X, subpops, lociOnCols = TRUE) PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE) } else if (percent_overlap == "0") { total_snps_to_simulate <- p_test + p_kinship # this contains all SNPs (X_{Testing}:X_{kinship}) out <- bnpsd::rbnpsd(Q, FF, total_snps_to_simulate) Xall <- t(out$X) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n) Xall[1:5,1:5] dim(Xall) subpops <- ceiling( (1:n)/n*k ) table(subpops) # got k=10 subpops with 100 individuals each # Snps used for kinship snps_kinships <- sample(cnames, p_kinship, replace = FALSE) length(snps_kinships) snps_design <- setdiff(cnames, snps_kinships) # length(snps_design) # setdiff(cnames, snps_kinships) %>% length() if (percent_causal !=0) { causal <- sample(snps_design, ncausal, replace = FALSE) } else if (percent_causal == 0) { causal <- "" } not_causal <- setdiff(snps_design, causal) Xkinship <- Xall[,snps_kinships] Xtest <- Xall[,snps_design] # now estimate kinship using popkin # PhiHat <- popkin::popkin(X, subpops, lociOnCols = TRUE) PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE) } kin <- 2 * PhiHat eiK <- eigen(kin) if (any(eiK$values < 1e-5)) { eiK$values[ eiK$values < 1e-5 ] <- 1e-5 } PC <- sweep(eiK$vectors, 2, sqrt(eiK$values), "*") # plot(eiK$values) # plot(PC[,1],PC[,2], pch = 19, col = rep(RColorBrewer::brewer.pal(5,"Paired"), each = 200)) np <- dim(Xtest) n <- np[[1]] p <- np[[2]] x_lasso <- cbind(Xtest,PC[,1:10]) x_lasso[1:5,1:5] # kin <- snpStats::xxt(dat$genotypes)/p beta <- rep(0, length = p) if (percent_causal != 0) { beta[which(colnames(Xtest) %in% causal)] <- runif(n = length(causal), beta_mean - 0.3, beta_mean + 0.3) } # beta[which(colnames(Xtest) %in% causal)] <- rnorm(n = length(causal)) mu <- as.numeric(Xtest %*% beta) tt <- eta * sigma2 * kin if (!all(eigen(tt)$values > 0)) { message("eta * sigma2 * kin not PD, using Matrix::nearPD") tt <- Matrix::nearPD(tt)$mat } P <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = tt) E <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) # y <- b0 + mu + t(P) + t(E) # y <- MASS::mvrnorm(1, mu = mu, Sigma = eta * sigma2 * kin + (1 - eta) * sigma2 * diag(n)) y <- b0 + mu + P + E models <- list(y = y, Xtest = Xtest, causal = causal, beta = beta, kin = kin, mu = mu, Xkinship = Xkinship, not_causal = not_causal, x_lasso = x_lasso) return(models) } ## @knitr models-not-used make_mixed_model_SSC <- function(b0, beta_mean, eta, sigma2, percent_causal, percent_overlap) { if (percent_causal == 1) { file_paths <- switch(percent_overlap, `0` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_8k.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_8k.rel.id", #X_phi is bed files used to make kinship X_Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_other_8k", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_1k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_10") }, `100` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_7990_causal_10.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_7990_causal_10.rel.id", #X_phi is bed files used to make kinship X_Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_other_7990_causal_10", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_1k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_10") }) } else if (percent_causal == 2.5) { file_paths <- switch(percent_overlap, `0` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel.id", #X_phi is bed files used to make kinship X_Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_other_4k", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_100") }, `100` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_7900_causal_100.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_7900_causal_100.rel.id", #X_phi is bed files used to make kinship X_Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_other_7900_causal_100", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_100") }) } else if (percent_causal == 10) { file_paths <- switch(percent_overlap, `0` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_400") }, `50` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3800_causal_200.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3800_causal_200.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_400") }, `100` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3600_causal_400.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3600_causal_400.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_400") }) } else if (percent_causal == 50) { file_paths <- switch(percent_overlap, `0` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_2000") }, `50` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3000_causal_1000.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3000_causal_1000.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_2000") }, `100` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_2000_causal_2000.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_2000_causal_2000.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_2000") }) } # gaston kinship Phi <- gaston::read.bed.matrix(file_paths$X_Phi) kin <- gaston::GRM(Phi, autosome.only = FALSE) kin[1:5,1:5] all(complete.cases(kin)) eiK <- eigen(kin) # all(rownames(as.matrix(x))==rownames(kin)) # deal with a small negative eigen value any(eiK$values < 0) eiK$values[ eiK$values < 0 ] <- 0 PC <- sweep(eiK$vectors, 2, sqrt(eiK$values), "*") dat <- gaston::read.bed.matrix(file_paths$bedfile) gaston::standardize(dat) <- "p" X <- as.matrix(dat) any(is.na(X)) X[is.na(X)] <- 0 any(is.na(X)) X[1:5,1:5] # need to re-order all(rownames(X)==rownames(kin)) all(rownames(X) %in% rownames(kin)) X <- X[match(rownames(kin), rownames(X)),] all(rownames(X)==rownames(kin)) np <- dim(X) n <- np[[1]] p <- np[[2]] all(rownames(X)==rownames(kin)) x_lasso <- cbind(X,PC[,1:10]) x_lasso[1:5,1:5] causal <- data.table::fread(file_paths$causal_list, header = FALSE)$V1 not_causal <- setdiff(colnames(X), causal) beta <- rep(0, length = p) beta[which(colnames(X) %in% causal)] <- runif(n = length(causal), beta_mean - 0.1, beta_mean + 0.1) mu <- as.numeric(X %*% beta) new_model(name = "ggmixSSCv3", label = sprintf("percent_causal = %s, percent_overlap = %s, eta = %s, sigma = %s, beta_mean = %s", percent_causal, percent_overlap, eta, sigma2, beta_mean), params = list(mu = mu, n = n, x = X, x_lasso = x_lasso, beta = beta, percent_causal = percent_causal, percent_overlap = percent_overlap, not_causal = not_causal, kin = kin, b0 = b0, sigma2 = sigma2, eta = eta, causal = causal, beta_mean = beta_mean), simulate = function(mu, sigma2, eta, kin, n, nsim) { P <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = eta * sigma2 * kin) E <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) y <- b0 + mu + t(P) + t(E) return(split(y, col(y))) # make each col its own list element }) } make_sparse_linear_model <- function(b0, eta, sigma2, type) { file_paths <- switch(type, causal_400 = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3600_causal_400.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3600_causal_400.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_400") }, causal_2k = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_2000_causal_2000.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_2000_causal_2000.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_2000") }, causal_4k = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_causal_4000.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_causal_4000.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_4000") }) kin <- as.matrix(read.table(file_paths$Phi)) kin_names <- data.table::fread(file_paths$Phi_names) dimnames(kin)[[1]] <- kin_names$V1 dimnames(kin)[[2]] <- kin_names$V1 dat <- snpStats::read.plink(file_paths$bedfile) x <- as(dat$genotypes, "numeric") np <- dim(x) n <- np[[1]] p <- np[[2]] causal <- data.table::fread(file_paths$causal_list, header = FALSE)$V1 not_causal <- setdiff(colnames(x), causal) beta <- rep(0, length = p) beta[which(colnames(x) %in% causal)] <- rnorm(n = length(causal)) mu <- as.numeric(x %*% beta) # kin <- as.matrix(read.table(file_paths$Phi))[1:200,1:200] # kin_names <- data.table::fread(file_paths$Phi_names) # dimnames(kin)[[1]] <- kin_names$V1[1:200] # dimnames(kin)[[2]] <- kin_names$V1[1:200] # dat <- snpStats::read.plink(file_paths$bedfile) # x <- as(dat$genotypes, "numeric")[1:200,1:100] # np <- dim(x) # n <- np[[1]] # p <- np[[2]] # causal <- colnames(x)[1:10] # not_causal <- setdiff(colnames(x), causal) # beta <- rep(0, length = p) # beta[which(colnames(x) %in% causal)] <- rnorm(n = length(causal)) # mu <- as.numeric(x %*% beta) new_model(name = "penfam", label = sprintf("type = %s, eta = %s", type, eta), params = list(mu = mu, n = n, x = x, beta = beta, type = type, not_causal = not_causal, kin = kin, b0 = b0, sigma2 = sigma2, eta = eta, causal = causal), simulate = function(mu, sigma2, eta, kin, n, nsim) { P <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = eta * sigma2 * kin) E <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) y <- b0 + mu + t(P) + t(E) return(split(y, col(y))) # make each col its own list element }) } make_mixed_model <- function(b0, eta, sigma2, type) { file_paths <- switch(type, causal_400 = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3600_causal_400.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3600_causal_400.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_400") }, causal_2k = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_2000_causal_2000.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_2000_causal_2000.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_2000") }, causal_4k = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_causal_4000.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_causal_4000.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_4000") }) kin <- as.matrix(read.table(file_paths$Phi)) kin_names <- data.table::fread(file_paths$Phi_names) dimnames(kin)[[1]] <- kin_names$V1 dimnames(kin)[[2]] <- kin_names$V1 isPD <- all(eigen(kin)$values > 0) how_many_neg_eigenvalues <- sum(eigen(kin)$values <= 0) if (!isPD) { kinPD <- as(nearPD(kin)$mat,"matrix") dimnames(kinPD)[[1]] <- kin_names$V1 dimnames(kinPD)[[2]] <- kin_names$V1 kin <- kinPD } dat <- snpStats::read.plink(file_paths$bedfile) x <- as(dat$genotypes, "numeric") np <- dim(x) n <- np[[1]] p <- np[[2]] # kin <- snpStats::xxt(dat$genotypes)/p causal <- data.table::fread(file_paths$causal_list, header = FALSE)$V1 not_causal <- setdiff(colnames(x), causal) beta <- rep(0, length = p) beta[which(colnames(x) %in% causal)] <- runif(n = length(causal), 0.1, 0.3) mu <- as.numeric(x %*% beta) new_model(name = "ggmix", label = sprintf("type = %s, eta = %s", type, eta), params = list(mu = mu, n = n, x = x, beta = beta, type = type, not_causal = not_causal, kin = kin, b0 = b0, sigma2 = sigma2, eta = eta, causal = causal), simulate = function(mu, sigma2, eta, kin, n, nsim) { P <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = eta * sigma2 * kin) E <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) y <- b0 + mu + t(P) + t(E) return(split(y, col(y))) # make each col its own list element }) } make_mixed_model_not_simulator <- function(b0,betamean,eta, sigma2, percent_causal, percent_overlap) { if (percent_causal == 1) { file_paths <- switch(percent_overlap, `0` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_8k.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_8k.rel.id", #X_phi is bed files used to make kinship X_Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_other_8k", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_1k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_10") }, `100` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_7990_causal_10.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_7990_causal_10.rel.id", #X_phi is bed files used to make kinship X_Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_other_7990_causal_10", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_1k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_10") }) } else if (percent_causal == 2.5) { file_paths <- switch(percent_overlap, `0` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel.id", #X_phi is bed files used to make kinship X_Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_other_4k", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_100") }, `100` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_7900_causal_100.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_7900_causal_100.rel.id", #X_phi is bed files used to make kinship X_Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_other_7900_causal_100", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_100") }) } else if (percent_causal == 10) { file_paths <- switch(percent_overlap, `0` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_400") }, `50` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3800_causal_200.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3800_causal_200.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_400") }, `100` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3600_causal_400.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3600_causal_400.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_400") }) } else if (percent_causal == 50) { file_paths <- switch(percent_overlap, `0` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_4k.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_2000") }, `50` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3000_causal_1000.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_3000_causal_1000.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_2000") }, `100` = { list(Phi = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_2000_causal_2000.rel", Phi_names = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/kinship/Kinship_other_2000_causal_2000.rel.id", bedfile = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/bed/X_test_4k", causal_list = "/mnt/GREENWOOD_BACKUP/home/sahir.bhatnagar/penfam/simulation/data/snplist/causal_SNP_from_X_test_2000") }) } # browser() # this uses plink kinship matrix # kin <- as.matrix(read.table(file_paths$Phi)) # kin_names <- data.table::fread(file_paths$Phi_names) # dimnames(kin)[[1]] <- kin_names$V1 # dimnames(kin)[[2]] <- kin_names$V1 # kin[1:5,1:5] # isPD <- all(eigen(kin)$values > 0) # how_many_neg_eigenvalues <- sum(eigen(kin)$values <= 0) # if (!isPD) { # kinPD <- as(nearPD(kin)$mat,"matrix") # dimnames(kinPD)[[1]] <- kin_names$V1 # dimnames(kinPD)[[2]] <- kin_names$V1 # kin <- kinPD # } # gaston kinship Phi <- gaston::read.bed.matrix(file_paths$X_Phi) kin <- gaston::GRM(Phi, autosome.only = FALSE) kin[1:5,1:5] all(complete.cases(kin)) eiK <- eigen(kin) # all(rownames(as.matrix(x))==rownames(kin)) # deal with a small negative eigen value any(eiK$values < 0) eiK$values[ eiK$values < 0 ] <- 0 PC <- sweep(eiK$vectors, 2, sqrt(eiK$values), "*") # dat <- snpStats::read.plink(file_paths$bedfile) # x <- as(dat$genotypes, "numeric") # np <- dim(x) # n <- np[[1]] # p <- np[[2]] # need to re-order # all(rownames(x)==rownames(kin)) # all(rownames(x) %in% rownames(kin)) # x <- x[match(rownames(kin), rownames(x)),] # all(rownames(x)==rownames(kin)) # # # # pcaCars <- princomp(kin, cor = TRUE) # eiK <- eigen(kin) # all(rownames(as.matrix(x))==rownames(kin)) # # deal with a small negative eigen value # any(eiK$values < 0) # eiK$values[ eiK$values < 0 ] <- 0 # PC <- sweep(eiK$vectors, 2, sqrt(eiK$values), "*") dat <- gaston::read.bed.matrix(file_paths$bedfile) gaston::standardize(dat) <- "p" X <- as.matrix(dat) any(is.na(X)) X[is.na(X)] <- 0 any(is.na(X)) X[1:5,1:5] # need to re-order all(rownames(X)==rownames(kin)) all(rownames(X) %in% rownames(kin)) X <- X[match(rownames(kin), rownames(X)),] all(rownames(X)==rownames(kin)) np <- dim(X) n <- np[[1]] p <- np[[2]] # # evv <- eigen(kin, symmetric=TRUE) # pcs <- evv$vectors[,1:10] # # plot(pcs[,3], pcaCars$loadings[,3]) # # view objects stored in pcaCars # names(pcaCars) # # proportion of variance explained # summary(pcaCars) # # scree plot # plot(pcaCars, type = "l") # plot(eiK$values) all(rownames(X)==rownames(kin)) x_lasso <- cbind(X,PC[,1:10]) x_lasso[1:5,1:5] # kin <- snpStats::xxt(dat$genotypes)/p causal <- data.table::fread(file_paths$causal_list, header = FALSE)$V1 not_causal <- setdiff(colnames(X), causal) beta <- rep(0, length = p) beta[which(colnames(X) %in% causal)] <- runif(n = length(causal), 0.9, 1.1) # beta[which(colnames(x) %in% causal)] <- rnorm(n = length(causal)) mu <- as.numeric(X %*% beta) # sum(complete.cases(X)) # dim(X) # length(beta) # kin <- as.matrix(read.table(file_paths$Phi))[1:500,1:500] # kin_names <- data.table::fread(file_paths$Phi_names) # dimnames(kin)[[1]] <- kin_names$V1[1:200] # dimnames(kin)[[2]] <- kin_names$V1[1:200] # # # browser() # # dat <- snpStats::read.plink(file_paths$bedfile) # x <- as(dat$genotypes, "numeric")[1:200,1:100] # np <- dim(x) # n <- np[[1]] # p <- np[[2]] # str(dat) # xst <- snpStats::snp.post.multiply(dat$genotypes, diag(p)) # dimnames(xst)[[2]] <- colnames(x) # xst[1:10,1:10] # causal <- colnames(x)[1:10] # not_causal <- setdiff(colnames(x), causal) # beta <- rep(0, length = p) # beta[which(colnames(x) %in% causal)] <- rnorm(n = length(causal)) # mu <- as.numeric(x %*% beta) P <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = eta * sigma2 * kin) E <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) # y <- b0 + mu + t(P) + t(E) y <- MASS::mvrnorm(1, mu = mu, Sigma = eta * sigma2 * kin + (1 - eta) * sigma2 * diag(n)) # y <- 0 + mu + P + E return(list(y = y, x = X, causal = causal, beta = beta, kin = kin, x_lasso = x_lasso, file_paths = file_paths)) # new_model(name = "penfam", label = sprintf("type = %s, eta = %s", type, eta), # params = list(mu = mu, n = n, x = x, beta = beta, type = type, not_causal = not_causal, # kin = kin, b0 = b0, sigma2 = sigma2, eta = eta, causal = causal), # simulate = function(mu, sigma2, eta, kin, n, nsim) { # P <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = eta * sigma2 * kin) # E <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) # y <- b0 + mu + t(P) + t(E) # return(split(y, col(y))) # make each col its own list element # }) } # this function is similar to the one used to simulate the data included in the ggmix package # see data-raw/bnpsd-data.R. I copied over this function to the R/utils.R file, because i modified it # a little bit so that it doenst plot anyhting. and also in case I remove it here. that function # will also be included in the package to simulate data make_ADmixed_model_not_simulator <- function(n, p_test, p_kinship, k, s, Fst, b0, beta_mean, eta, sigma2, geography = c("ind", "1d","circ"), percent_causal, percent_overlap) { # p_test: number of variables in X_test, i.e., the design matrix # p_kinship: number of variable in X_kinship, i.e., matrix used to calculate kinship # k: Number of intermediate subpopulations # s: The desired bias coefficient, which specifies σ indirectly. Required if sigma is missing # F: The length-k vector of inbreeding coefficients (or FST's) of the intermediate subpopulations, # up to a scaling factor (which cancels out in calculations). Required if sigma is missing # Fst: The desired final FST of the admixed individuals. Required if sigma is missing # browser() # define population structure FF <- 1:k # subpopulation FST vector, up to a scalar # s <- 0.5 # desired bias coefficient # Fst <- 0.1 # desired FST for the admixed individuals geography <- match.arg(geography) if (geography == "1d") { obj <- bnpsd::q1d(n = n, k = k, s = s, F = FF, Fst = Fst) Q <- obj$Q FF <- obj$F } else if (geography == "ind") { n1 <- 200; n2 <- 200; n3 <- 200; n4 <- 200; n5 <- 200 # here’s the labels (for simplicity, list all individuals of S1 first, then S2, then S3) labs <- c( rep.int("S1", n1), rep.int("S2", n2), rep.int("S3", n3), rep.int("S4", n4), rep.int("S5", n5)) # data dimensions infered from labs: length(labs) # number of individuals "n" # desired admixture matrix ("is" stands for "Independent Subpopulations") Q <- bnpsd::qis(labs) FF <- 1:k # subpopulation FST vector, unnormalized so far FF <- FF/popkin::fst(FF)*Fst # normalized to have the desired Fst } else if (geography == "circ") { obj <- bnpsd::q1dc(n = n, k = k, s = s, F = FF, Fst = Fst) Q <- obj$Q FF <- obj$F } ncausal <- p_test * percent_causal # browser() if (percent_overlap == "100") { total_snps_to_simulate <- p_test + p_kinship - ncausal # this contains all SNPs (X_{Testing}:X_{kinship}) out <- bnpsd::rbnpsd(Q, FF, total_snps_to_simulate) Xall <- t(out$X) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n) Xall[1:5,1:5] dim(Xall) subpops <- ceiling( (1:n)/n*k ) table(subpops) # got k=10 subpops with 100 individuals each # Snps used for kinship snps_kinships <- sample(cnames, p_kinship, replace = FALSE) length(snps_kinships) # all causal snps are in kinship matrix causal <- sample(snps_kinships, ncausal, replace = FALSE) snps_design <- c(setdiff(cnames, snps_kinships), causal) # length(snps_design) # setdiff(cnames, snps_kinships) %>% length() not_causal <- setdiff(snps_design, causal) Xkinship <- Xall[,snps_kinships] Xtest <- Xall[,snps_design] # now estimate kinship using popkin # PhiHat <- popkin::popkin(X, subpops, lociOnCols = TRUE) PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE) } else if (percent_overlap == "0") { total_snps_to_simulate <- p_test + p_kinship # this contains all SNPs (X_{Testing}:X_{kinship}) out <- bnpsd::rbnpsd(Q, FF, total_snps_to_simulate) Xall <- t(out$X) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n) Xall[1:5,1:5] dim(Xall) subpops <- ceiling( (1:n)/n*k ) table(subpops) # got k=10 subpops with 100 individuals each # Snps used for kinship snps_kinships <- sample(cnames, p_kinship, replace = FALSE) length(snps_kinships) snps_design <- setdiff(cnames, snps_kinships) # length(snps_design) # setdiff(cnames, snps_kinships) %>% length() causal <- sample(snps_design, ncausal, replace = FALSE) not_causal <- setdiff(snps_design, causal) Xkinship <- Xall[,snps_kinships] Xtest <- Xall[,snps_design] # now estimate kinship using popkin # PhiHat <- popkin::popkin(X, subpops, lociOnCols = TRUE) PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE) } kin <- 2 * PhiHat eiK <- eigen(kin) if (any(eiK$values < 1e-5)) { eiK$values[ eiK$values < 1e-5 ] <- 1e-5 } PC <- sweep(eiK$vectors, 2, sqrt(eiK$values), "*") plot(eiK$values) plot(PC[,1],PC[,2], pch = 19, col = rep(RColorBrewer::brewer.pal(5,"Paired"), each = 200)) np <- dim(Xtest) n <- np[[1]] p <- np[[2]] x_lasso <- cbind(Xtest,PC[,1:10]) x_lasso[1:5,1:5] # kin <- snpStats::xxt(dat$genotypes)/p beta <- rep(0, length = p) # beta[which(colnames(Xtest) %in% causal)] <- runif(n = length(causal), beta_mean - 0.1, beta_mean + 0.1) beta[which(colnames(Xtest) %in% causal)] <- rnorm(n = length(causal)) mu <- as.numeric(Xtest %*% beta) # browser() tt <- eta * sigma2 * kin if (!all(eigen(tt)$values > 0)) { message("eta * sigma2 * kin not PD, using Matrix::nearPD") tt <- Matrix::nearPD(tt)$mat } P <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = tt) E <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) # y <- b0 + mu + t(P) + t(E) # y <- MASS::mvrnorm(1, mu = mu, Sigma = eta * sigma2 * kin + (1 - eta) * sigma2 * diag(n)) y <- b0 + mu + P + E return(list(y = y, x = Xtest, causal = causal, beta = beta, kin = kin, not_causal = not_causal, x_lasso = x_lasso)) # new_model(name = "penfam", label = sprintf("type = %s, eta = %s", type, eta), # params = list(mu = mu, n = n, x = x, beta = beta, type = type, not_causal = not_causal, # kin = kin, b0 = b0, sigma2 = sigma2, eta = eta, causal = causal), # simulate = function(mu, sigma2, eta, kin, n, nsim) { # P <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = eta * sigma2 * kin) # E <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) # y <- b0 + mu + t(P) + t(E) # return(split(y, col(y))) # make each col its own list element # }) } make_ADmixed_model_not_simulator_with_validation <- function(n_train, p_test, p_kinship, k, s, Fst, b0, beta_mean, n_validation, eta, sigma2, geography = c("ind", "1d","circ"), percent_causal, percent_overlap) { # p_test: number of variables in X_test, i.e., the design matrix # p_kinship: number of variable in X_kinship, i.e., matrix used to calculate kinship # k: Number of intermediate subpopulations # s: The desired bias coefficient, which specifies σ indirectly. Required if sigma is missing # F: The length-k vector of inbreeding coefficients (or FST's) of the intermediate subpopulations, # up to a scaling factor (which cancels out in calculations). Required if sigma is missing # Fst: The desired final FST of the admixed individuals. Required if sigma is missing # browser() # define population structure # n_total <- n_train + n_validation # train_ind <- sample(1:n_total, n_train) FF <- 1:k # subpopulation FST vector, up to a scalar # s <- 0.5 # desired bias coefficient # Fst <- 0.1 # desired FST for the admixed individuals geography <- match.arg(geography) if (geography == "1d") { obj <- bnpsd::q1d(n = n_train, k = k, s = s, F = FF, Fst = Fst) Q <- obj$Q FF <- obj$F obj_val <- bnpsd::q1d(n = n_validation, k = k, s = s, F = FF, Fst = Fst) Q_val <- obj_val$Q FF_val <- obj_val$F } else if (geography == "ind") { n_train_k <- n_train / k n1 <- n_train_k; n2 <- n_train_k; n3 <- n_train_k; n4 <- n_train_k; n5 <- n_train_k # here’s the labels (for simplicity, list all individuals of S1 first, then S2, then S3) labs <- c( rep.int("S1", n1), rep.int("S2", n2), rep.int("S3", n3), rep.int("S4", n4), rep.int("S5", n5)) # data dimensions infered from labs: length(labs) # number of individuals "n" # desired admixture matrix ("is" stands for "Independent Subpopulations") Q <- bnpsd::qis(labs) FF <- 1:k # subpopulation FST vector, unnormalized so far FF <- FF/popkin::fst(FF)*Fst # normalized to have the desired Fst n_val_k <- n_validation / k n1_val <- n_val_k; n2_val <- n_val_k; n3_val <- n_val_k; n4_val <- n_val_k; n5_val <- n_val_k # here’s the labels (for simplicity, list all individuals of S1 first, then S2, then S3) labs_val <- c( rep.int("S1", n1_val), rep.int("S2", n2_val), rep.int("S3", n3_val), rep.int("S4", n4_val), rep.int("S5", n5_val)) Q_val <- bnpsd::qis(labs_val) FF_val <- 1:k # subpopulation FST vector, unnormalized so far FF_val <- FF_val/popkin::fst(FF_val)*Fst # normalized to have the desired Fst } else if (geography == "circ") { obj <- bnpsd::q1dc(n = n_train, k = k, s = s, F = FF, Fst = Fst) Q <- obj$Q FF <- obj$F obj_val <- bnpsd::q1dc(n = n_validation, k = k, s = s, F = FF, Fst = Fst) Q_val <- obj_val$Q FF_val <- obj_val$F # this will be the same as FF } ncausal <- p_test * percent_causal # browser() if (percent_overlap == "100") { # browser() total_snps_to_simulate <- p_test + p_kinship - ncausal # this contains all SNPs (X_{Testing}:X_{kinship}) pAnc <- bnpsd::rpanc(total_snps_to_simulate) # random vector of ancestral allele frequencies (length= number of loci) B <- bnpsd::rpint(pAnc, FF) # matrix of intermediate subpop allele freqs P <- bnpsd::rpiaf(B = B,Q = Q) out <- bnpsd::rgeno(P) Xall <- t(out) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n_train) B_val <- bnpsd::rpint(pAnc, FF_val) # matrix of intermediate subpop allele freqs (FF is same as FF_val) # head(B_val);head(B) # sigma <- 1 # dispersion parameter of intermediate subpops # Q <- q1d(n, k, sigma) # non-trivial admixture proportions # head(obj$Q);head(obj_val$Q) # also the same, but would change depending on if n_train is different from n_validation # tail(obj$Q);tail(obj_val$Q) P_val <- bnpsd::rpiaf(B = B_val, Q = Q_val) # P_val[1:5,1:5] ; P[1:5,1:5] out_val <- bnpsd::rgeno(P_val) Xall_val <- t(out_val) # genotypes are columns, rows are subjects colnames(Xall_val) <- cnames rownames(Xall_val) <- paste0("id", 1:n_validation) # tra <- gaston::as.bed.matrix(Xall) # tra@p %>% plot # val <- gaston::as.bed.matrix(Xall_val) # dev.off() # these plots show the train and val MAFs are assez correler # plot(tra@p, val@p) # cor(tra@p, val@p) # subpops <- ceiling( (1:n)/n*k ) # table(subpops) # got k=10 subpops with 100 individuals each # Snps used for kinship snps_kinships <- sample(cnames, p_kinship, replace = FALSE) length(snps_kinships) # all causal snps are in kinship matrix causal <- sample(snps_kinships, ncausal, replace = FALSE) length(causal) snps_design <- c(setdiff(cnames, snps_kinships), causal) # length(snps_design) # setdiff(cnames, snps_kinships) %>% length() not_causal <- setdiff(snps_design, causal) Xkinship <- Xall[,snps_kinships] Xtest <- Xall[,snps_design] Xkinship_val <- Xall_val[,snps_kinships] Xtest_val <- Xall_val[,snps_design] # now estimate kinship using popkin # PhiHat <- popkin::popkin(X, subpops, lociOnCols = TRUE) PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE) PhiHat_val <- popkin::popkin(Xkinship_val, lociOnCols = TRUE) # popkin::plotPopkin(list(PhiHat,PhiHat_val)) } else if (percent_overlap == "0") { total_snps_to_simulate <- p_test + p_kinship # this contains all SNPs (X_{Testing}:X_{kinship}) out <- bnpsd::rbnpsd(Q, FF, total_snps_to_simulate) Xall <- t(out$X) # genotypes are columns, rows are subjects cnames <- paste0("X", 1:total_snps_to_simulate) colnames(Xall) <- cnames rownames(Xall) <- paste0("id", 1:n) Xall[1:5,1:5] dim(Xall) subpops <- ceiling( (1:n)/n*k ) table(subpops) # got k=10 subpops with 100 individuals each # Snps used for kinship snps_kinships <- sample(cnames, p_kinship, replace = FALSE) length(snps_kinships) snps_design <- setdiff(cnames, snps_kinships) # length(snps_design) # setdiff(cnames, snps_kinships) %>% length() causal <- sample(snps_design, ncausal, replace = FALSE) not_causal <- setdiff(snps_design, causal) Xkinship <- Xall[,snps_kinships] Xtest <- Xall[,snps_design] # now estimate kinship using popkin # PhiHat <- popkin::popkin(X, subpops, lociOnCols = TRUE) PhiHat <- popkin::popkin(Xkinship, lociOnCols = TRUE) } kin <- 2 * PhiHat eiK <- eigen(kin) if (any(eiK$values < 1e-5)) { eiK$values[ eiK$values < 1e-5 ] <- 1e-5 } PC <- sweep(eiK$vectors, 2, sqrt(eiK$values), "*") # plot(eiK$values) # plot(PC[,1],PC[,2], pch = 19, col = rep(RColorBrewer::brewer.pal(5,"Paired"), each = 200)) kin_val <- 2 * PhiHat_val eiK_val <- eigen(kin_val) if (any(eiK_val$values < 1e-5)) { eiK_val$values[ eiK_val$values < 1e-5 ] <- 1e-5 } PC_val <- sweep(eiK_val$vectors, 2, sqrt(eiK_val$values), "*") # plot(eiK_val$values) par(mfrow = c(1,2)) plot(PC[,1],PC[,2], pch = 19, col = rep(RColorBrewer::brewer.pal(5,"Paired"), each = 200), main = "training set") plot(PC_val[,1],PC_val[,2], pch = 19, col = rep(RColorBrewer::brewer.pal(5,"Paired"), each = 200), main = "test set") np <- dim(Xtest) n <- np[[1]] p <- np[[2]] x_lasso <- cbind(Xtest,PC[,1:10]) x_lasso[1:5,1:5] # kin <- snpStats::xxt(dat$genotypes)/p beta <- rep(0, length = p) # beta_val <- rep(0, length = p) # beta[which(colnames(Xtest) %in% causal)] <- runif(n = length(causal), beta_mean - 0.1, beta_mean + 0.1) beta[which(colnames(Xtest) %in% causal)] <- rnorm(n = length(causal)) # beta_val[which(colnames(Xtest) %in% causal)] <- rnorm(n = length(causal)) mu <- as.numeric(Xtest %*% beta) mu_val <- as.numeric(Xtest_val %*% beta) P <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = eta * sigma2 * kin) E <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) y <- b0 + mu + P + E P <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = eta * sigma2 * kin_val) E <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) y_val <- b0 + mu_val + P + E return(list(y = y, x = Xtest, causal = causal, beta = beta, kin = kin, y_val = y_val, x_val = Xtest_val, not_causal = not_causal, x_lasso = x_lasso)) # new_model(name = "penfam", label = sprintf("type = %s, eta = %s", type, eta), # params = list(mu = mu, n = n, x = x, beta = beta, type = type, not_causal = not_causal, # kin = kin, b0 = b0, sigma2 = sigma2, eta = eta, causal = causal), # simulate = function(mu, sigma2, eta, kin, n, nsim) { # P <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = eta * sigma2 * kin) # E <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) # y <- b0 + mu + t(P) + t(E) # return(split(y, col(y))) # make each col its own list element # }) } make_INDmixed_model_not_simulator <- function(n, p, ncausal, k, s, Fst, b0, beta_mean, eta, sigma2) { # k: Number of intermediate subpopulations # s: The desired bias coefficient, which specifies σ indirectly. Required if sigma is missing # F: The length-k vector of inbreeding coefficients (or FST's) of the intermediate subpopulations, # up to a scaling factor (which cancels out in calculations). Required if sigma is missing # Fst: The desired final FST of the admixed individuals. Required if sigma is missing # browser() # define population structure n1 <- 200; n2 <- 200; n3 <- 200; n4 <- 200; n5 <- 200 # here’s the labels (for simplicity, list all individuals of S1 first, then S2, then S3) labs <- c( rep.int("S1", n1), rep.int("S2", n2), rep.int("S3", n3), rep.int("S4", n4), rep.int("S5", n5)) # data dimensions infered from labs: length(labs) # number of individuals "n" # desired admixture matrix ("is" stands for "Independent Subpopulations") Q <- bnpsd::qis(labs) FF <- 1:k # subpopulation FST vector, unnormalized so far FF <- FF/popkin::fst(FF)*Fst # normalized to have the desired Fst # s <- 0.5 # desired bias coefficient # Fst <- 0.1 # desired FST for the admixed individuals # obj <- bnpsd::q1d(n = n, k = k, s = s, F = FF, Fst = Fst) # admixture proportions from 1D geography # Q <- obj$Q # FF <- obj$F out <- bnpsd::rbnpsd(Q, FF, p) X <- t(out$X) # genotypes are columns, rows are subjects dim(X) colnames(X) <- paste0("X",1:ncol(X)) rownames(X) <- paste0("id",1:nrow(X)) dim(X) X[1:5,1:5] subpops <- ceiling( (1:n)/n*k ) table(subpops) # got k=10 subpops with 100 individuals each # now estimate kinship using popkin # PhiHat <- popkin::popkin(X, subpops = c(rep(1, n1), rep(2, n2), rep(3, n3), rep(4, n4), rep(5, n5)), lociOnCols = TRUE) PhiHat <- popkin::popkin(X, lociOnCols = TRUE) # PhiHat[1:5,1:5] kin <- 2 * PhiHat # kin <- PhiHat # kin <- gaston::GRM(gaston::as.bed.matrix(X), autosome.only = FALSE) # inbrDiag(PhiHat) # dev.off() # plotPopkin(list(PhiHat, PhiHat2)) isPD <- all(eigen(kin)$values > 0) how_many_neg_eigenvalues <- sum(eigen(kin)$values <= 0) if (!isPD) { kinPD <- as(nearPD(kin)$mat,"matrix") kin <- kinPD } kin[1:5,1:5] dim(PhiHat) eiK <- eigen(kin) # all(rownames(as.matrix(x))==rownames(kin)) # deal with a small negative eigen value plot(eiK$values) sum(eiK$values < 0) if (any(eiK$values < 0)) { eiK$values[ eiK$values < 0 ] <- 0 } PC <- sweep(eiK$vectors, 2, sqrt(eiK$values), "*") # dev.off() plot(eiK$values) plot(PC[,1],PC[,2], pch = 19, col = rep(RColorBrewer::brewer.pal(5,"Paired"), each = n1)) # X <- t(out$X) # dim(X) #Phi;Phi_names;bedfile;causal_list # browser() # need to re-order # all(rownames(X)==rownames(kin)) # all(rownames(X) %in% rownames(kin)) # X <- X[match(rownames(kin), rownames(X)),] # all(rownames(X)==rownames(kin)) np <- dim(X) n <- np[[1]] p <- np[[2]] x_lasso <- cbind(X,PC[,1:10]) x_lasso[1:5,1:5] # kin <- snpStats::xxt(dat$genotypes)/p causal <- sample(colnames(X), ncausal, replace = FALSE) not_causal <- setdiff(colnames(X), causal) beta <- rep(0, length = p) beta[which(colnames(X) %in% causal)] <- runif(n = length(causal), beta_mean - 0.2, beta_mean + 0.2) # beta[which(colnames(x) %in% causal)] <- rnorm(n = length(causal)) mu <- as.numeric(X %*% beta) P <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = eta * sigma2 * kin) E <- MASS::mvrnorm(1, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # P <- MASS::mvrnorm(1, rep(0,n), 1.2625 * kin) # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) # y <- b0 + mu + t(P) + t(E) # y <- MASS::mvrnorm(1, mu = mu, Sigma = eta * sigma2 * kin + (1 - eta) * sigma2 * diag(n)) # y <- mu + P + rnorm(n, 0, 1) y <- 0 + mu + P + E return(list(y = y, x = X, causal = causal, beta = beta, kin = kin, not_causal = not_causal, x_lasso = x_lasso)) # new_model(name = "penfam", label = sprintf("type = %s, eta = %s", type, eta), # params = list(mu = mu, n = n, x = x, beta = beta, type = type, not_causal = not_causal, # kin = kin, b0 = b0, sigma2 = sigma2, eta = eta, causal = causal), # simulate = function(mu, sigma2, eta, kin, n, nsim) { # P <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = eta * sigma2 * kin) # E <- MASS::mvrnorm(nsim, mu = rep(0, n), Sigma = (1 - eta) * sigma2 * diag(n)) # # y <- mu + sigma * matrix(rnorm(nsim * n), n, nsim) # y <- b0 + mu + t(P) + t(E) # return(split(y, col(y))) # make each col its own list element # }) } # not used gaston testing code -------------------------------------------- # gaston_mat # head(gaston_mat@ped) # # ld.x <- gaston::LD(gaston_mat, c(1,ncol(gaston_mat))) # standardize(gaston_mat) # # plot(gaston_mat@p, gaston_mat@sigma, xlim=c(0,1)) # t <- seq(0,1,length=101); # lines(t, sqrt(2*t*(1-t)), col="red") # # y <- gaston::LD.thin(gaston_mat, threshold = 0.4, max.dist = 500e3) # y # # gaston_mat <- set.stats(gaston_mat) # head(gaston_mat@ped) # head(gaston_mat@snps) # # # gaston_mat@snps[1:5,1:100] # # gaston_mat <- gaston::read.bed.matrix(file_paths$bedfile_for_kinship) # grm_gaston <- gaston::GRM(gaston_mat) # # standardize(gaston_mat) <- "p" # # head(grm_gaston) # # grm_gaston[1:5,1:5] # X <- as.matrix(gaston_mat) # head(gaston_mat@bed) # any(is.na(gaston_mat@p)) # X[1:5,1:5] # X[is.na(X)] <- 0 # X[1:5,1:5] # colMeans(X) %>% round(100) %>% plot # apply(X, 2, sd) %>% plot # sqrt(2*gaston_mat@p*(1-gaston_mat@p)) %>% plot # # col(X) %>% round(100) %>% plot # scale(X)[1:5,1:5] # scale(X, center = 2*gaston_mat@p,scale = sqrt(2*gaston_mat@p*(1-gaston_mat@p)))[1:5,1:5] # # x <- as.bed.matrix(TTN.gen, TTN.fam, TTN.bim) # X <- as.matrix(x) # X[1:5,1:4] # standardize(x) <- "p" # as.matrix(x)[1:5, 1:4] # colMeans(as.matrix(x)) %>% round(10) %>% plot # apply(as.matrix(x), 2, sd) %>% plot # pheatmap::pheatmap(grm_gaston, show_rownames = F, show_colnames = F, color = rev(viridis::viridis(100))) # # # eiK <- eigen(grm_gaston) # any(eiK$values < 0) # eiK$values[ eiK$values < 0 ] <- 0 # plot(eiK$vectors[,1], eiK$vectors[,2]) # # PC <- sweep(eiK$vectors, 2, sqrt(eiK$values), "*") # plot(PC[,1],PC[,2]) # plot(PC[,2],PC[,3]) # # all(eigen(grm_gaston)$values > 0) # eigen(grm_gaston)$values[which(eigen(grm_gaston)$values < 0)] # # svd()