# usage: zcat genotypes.vcf.gz | python3 find_gcs.py myibd.ibdclust.gz minmaf > outfile # genotypes.vcf.gz is a VCF file of sequence data for one chromosome # myibd.ibdclust.gz is the output from running ibdcluster.jar on that same VCF file # minmaf is the minimum minor allele frequency, for example 0.1. It is recommended to use the same minor allele frequency for ibdcluster and this program # outfile (stdout) has one line per detected allele conversion # each such line has position (bp), size of subcluster 1, size of subcluster 2, identities of haplotypes in subcluster 1 and then those in subcluster 2. # each haplotype identity has individual index (zero-based) : haplotype index (0 or 1) import sys, gzip, time from collections import Counter omibdfile = gzip.open(sys.argv[1],mode='rt') minaf = float(sys.argv[2]) # minimum allele frequency, in order to properly handle multiallelics maxmaf = 1.0 minclustsize = 4 minpartsize = 2 deletion = 1 ids = omibdfile.readline().split()[3:] for vline in sys.stdin: if vline[1]=="#": continue vids = vline.split()[9:] break vindex = [vids.index(thisid)+9 for thisid in ids] nindiv = len(vindex) n = 2*nindiv # number of haplotypes acclustsizes = Counter() # checks whether the individual that carries hapindex is homozygous def is_homozygous(valleles,hapindex): if hapindex < nindiv: otherindex = hapindex + nindiv else: otherindex = hapindex - nindiv return valleles[hapindex]==valleles[otherindex] # checks whether the (heterozygous) individual's other cluster is mixed (so could be fixed with a rephasing) def is_correctable(oclust,counts,hapindex): if hapindex < nindiv: otherindex = hapindex + nindiv else: otherindex = hapindex - nindiv otherclust = oclust[otherindex] thiscounts = counts[otherclust] return sum(thiscounts)!=max(thiscounts) # returns true if this is a mixed cluster # print out the gc events for clusters that are mixed at this locus def print_gcs(valleles,oclust,counts,pos,clustsize,ndel,acclustsizes): nclust = len(oclust) for j,x,z in zip(range(nclust),counts,clustsize): if z < minclustsize: continue # cluster too small to be mixed doubleplus = [y>= minpartsize for y in x] if sum(doubleplus) <= 1: continue if sum(doubleplus) > 2: print("more than two alleles in cluster at", pos, "printing first two",file=sys.stderr) nhaps = len(oclust) nindiv = nhaps/2 hapindices = [i for i in range(nhaps) if oclust[i]==j] homozygous = [is_homozygous(valleles,x) for x in hapindices] if deletion and all(homozygous): ndel[0] += 1; continue whichalleles = [i for i in range(len(x)) if x[i]>1] which0 = [i for i in range(len(hapindices)) if valleles[hapindices[i]] == whichalleles[0] ] which1 = [i for i in range(len(hapindices)) if valleles[hapindices[i]] == whichalleles[1] ] hapindices0 = [hapindices[i] for i in which0] hapindices1 = [hapindices[i] for i in which1] print(pos, len(hapindices0), len(hapindices1),end=' ') for i in hapindices0 + hapindices1: print(':'.join([str(int(i % nindiv)), str(int(i/nindiv))]),end=' ') print() acclustsizes.update([len(hapindices)]) oprev = omibdfile.readline().split() nhaps = (len(oprev) - 3)*2 onew = oprev ndel = [0]; # counting how many times we invoke deletion hypothesis for vline in sys.stdin: if vline[0]=='#': continue #starttime = time.time() vbits = vline.split() pos = int(vbits[1]) # find the closest positions in the multiIBD files while pos > int(onew[1]): oprev = onew onew = omibdfile.readline().split() # deal with end of file case if len(onew)==0: onew = oprev break othis = onew if abs(int(onew[1])-pos) maxmaf: continue if maf < minaf: continue oclust = [int(x.split('|')[0]) for x in othis[3:]] + [int(x.split('|')[1]) for x in othis[3:]] if len(valleles)!=nhaps: print("files don't match in number of individuals",file=sys.stderr) maxclustplus = max(oclust)+1 counts = [[0]*maxalleleplus for j in range(maxclustplus)] clustsize = [0]*maxclustplus for i in range(nhaps): counts[oclust[i]][valleles[i]]+=1 clustsize[oclust[i]]+=1 # if truly multiallelic (3 or more alleles with allele count > 1) then remove any alleles with maf < minaf if nalleles>2: if sum([x>0 for x in allelefreqs])>2: for i in range(maxalleleplus): if allelefreqs[i] < minaf: for x in counts: x[i] = 0 print_gcs(valleles,oclust,counts,pos,clustsize,ndel,acclustsizes)