# This script uses the FLARE .panels output to build clusters, estimate model parameters, and output a FLARE model file. # The model file includes a comment line with the lagged cross-correlations. # This script also outputs a log file that includes some input data summaries. # Usage: python3 create_model_file.py n_ancestries panels_filename output_prefix # "n_ancestries" is a positive integer giving the number of clusters (ancestries) to fit to the data # "panels_filename" is the name of a file that holds panel probabilities output by FLARE # "output_prefix" is a string, and the output files will be out_prefix.model and out_prefix.log import sys from sklearn.mixture import GaussianMixture import numpy as np nanc = int(sys.argv[1]) panelfile = open(sys.argv[2]) outprefix = sys.argv[3] logfile = open(outprefix+".log","w") outfile = open(outprefix+".model","w") T=10 # this is just for the model file, and is an initialization for flare print("input:",sys.argv[3],file=logfile) # use header of panels file to get nrefhaps and refpops nrefhaps = int(panelfile.readline().split('=')[1]) # total number of reference haplotypes across all panels that are being used refpops = panelfile.readline().split()[4:] npanel = len(refpops) # read in the panels file panelprobs = [] # first dimension is location rhovals = [] # first dimension is location panelprobschrom = [] rhovalschrom = [] oldlocindex = -1 hapindices = [] oldchrom = -1 for line in panelfile: if line[0]=='#': continue bits = line.split() locindex = int(bits[0]); chrom = bits[1].split(':')[0]; hapindex = int(bits[2]); thisrho = float(bits[3]) if chrom != oldchrom: oldchrom = chrom if len(panelprobschrom)>0: panelprobs.extend(panelprobschrom) rhovals.extend(rhovalschrom) panelprobschrom = [] rhovalschrom = [] if locindex != oldlocindex: oldlocindex = locindex panelprobschrom.append([]) # second dimension is haplotype rhovalschrom.append([]) # second dimension is haplotype if len(hapindices) > 0: if any([x!=y for x,y in zip(hapindices,oldhapindices)]): print("haplotype indicies inconsistent across positions",file=sys.stderr) raise SystemExit oldhapindices = [x for x in hapindices] hapindices = [] hapindices.append(hapindex) probs = [float(x) for x in bits[4:]] if len(probs)!=npanel: print("number of panels varies",file=sys.stderr) raise SystemExit panelprobschrom[len(panelprobschrom)-1].append(probs) # third dimension is panel rhovalschrom[len(panelprobschrom)-1].append(thisrho) # thisrho is a scalar panelprobs.extend(panelprobschrom) rhovals.extend(rhovalschrom) nloc = len(panelprobs) # summary of panels input printed to log file print(nloc,"locations,",len(hapindices),"haplotypes,",npanel,"reference panels:",' '.join(refpops),",",nrefhaps,"reference haplotypes",file=logfile) # do the clustering, including a transformation to avoid having entries sum to 1 def cluster(panelprobs,nanc): nhaps = len(panelprobs[0]) # reshape for clustering # combine the locations and haplotypes into a single dimension # will first have (trimmed) location 1 for all haps first, etc clustinput = np.array([hap for location in panelprobs for hap in location]) # drop one column of clustinput, which is the one with the highest sum sums = np.sum(clustinput,0) whichdrop = np.argmax(sums) transformed = np.delete(clustinput,whichdrop,1) # do the clustering/prediction myclust = GaussianMixture(nanc,init_params='k-means++',n_init=10) myclust.fit(transformed) labels = myclust.predict(transformed) # reshape results back to have the location dimension (location first dimension and haps second dimension) labelmatrix = [labels[i:i+nhaps] for i in range(0,len(labels),nhaps)] # put the removed column back in to the parameter (P) matrix # rows are ancestries, columns are ref panels extra_column = [1.0-sum(x) for x in myclust.means_] param = np.insert(myclust.means_,whichdrop,extra_column,axis=1) return (labelmatrix, param) # compute rho (switch rates) def make_rho(label_matrix,rho_matrix,nanc): # convert label matrix to vector matching the rho vector labels = [hap for location in label_matrix for hap in location] rho_vec = [hap for location in rho_matrix for hap in location] if len(labels)!=len(rho_vec): print("trouble in make_rho",len(labels),len(rho_vec),file=sys.stderr) raise SystemExit rho = [] for i in range(nanc): rhoi_vec = [rho_vec[index] for index in range(len(labels)) if labels[index]==i] rhoi = sum(rhoi_vec)/len(rhoi_vec) rho.append(rhoi) return rho # calculate correlations at lag # labelmatrix has location in first dim, haplotype in second dim def autoc(labelmatrix,nanc,lag): acs=dict() maxcrosscor= -1.0 minautocor=1.0 if len(labelmatrix) maxcrosscor: maxcrosscor = ac if i1==i2 and ac < minautocor: minautocor = ac return acs,maxcrosscor,minautocor thislabelmatrix,thisparams=cluster(panelprobs,nanc) thisrho = make_rho(thislabelmatrix,rhovals,nanc) # print out the model file print("# model is output from cluster_model_singlewin.py",file=outfile) lag=1 acs,maxcrosscor,minautocor = autoc(thislabelmatrix,nanc,lag) print("# lag-",lag,"-window correlations are ",acs,file=outfile,sep="") print("# max cross-correlation is",maxcrosscor,file=outfile) print("# min auto-correlation is",minautocor,file=outfile) anclab="anc" print("# list of ancestries",file=outfile) pops="" for i in range(nanc) : pops += anclab+"_"+str(i) if i < nanc-1: pops +="\t" print(pops,file=outfile) print(file=outfile) print("# list of reference panels",file=outfile) print("\t".join(refpops),file=outfile) print(file=outfile) print("# T: number of generations since admixture",file=outfile) print(T,file=outfile) print(file=outfile) print("# mu[i]: proportion of target genotypes with ancestry i",file=outfile) print("\t".join([str(1.0/nanc)]*nanc),file=outfile) print(file=outfile) print("# p[i][j]: probability that a model state haplotype is in reference panel j",file=outfile) print("# when the model state ancestry is i",file=outfile) for Prow in thisparams: print("\t".join([str(x) for x in Prow]),file=outfile) print(file=outfile) print("# theta[i][j]: probability that a model state haplotype and the target",file=outfile) print("# haplotype carry different alleles when the model state haplotype",file=outfile) print("# is in reference panel j and the model state ancestry is i",file=outfile) mylambda=1/(np.log(nrefhaps)+0.5) theta=mylambda/(2*mylambda+2*nrefhaps) for i in range(nanc): print("\t".join([str(theta)]*npanel),file=outfile) print(file=outfile) print("# rho[i]: rate of the exponential IBD segment cM-length distribution when the",file=outfile) print("# most recent common ancestor precedes admixture and has ancestry i",file=outfile) print("\t".join([str(x) for x in thisrho]),file=outfile)