#!/usr/bin/env python3
"""
vcf2plink.py - converts a VCF file to plink format
Copyright (C) 2016 Giulio Genovese
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
Written by Giulio Genovese
"""
import argparse, os, sys
from subprocess import call, Popen, PIPE
parser = argparse.ArgumentParser(description = 'vcf2plink.py: converts a VCF file to plink format (Oct 3rd 2016)', add_help = False, usage = 'vcf2plink.py [options]')
parser.add_argument('--vcf', metavar = '', type = str, default = '-', help = 'Specify a VCF file to be converted')
parser.add_argument('--ref', metavar = '', type = str, required = True, help = 'reference sequence (e.g. human_g1k_v37.fasta)')
parser.add_argument('--filter', metavar = '', type = str, help = 'exclude sites for which the expression is true (e.g. "FORMAT/DP<10 || FORMAT/GQ<20") (see bcftools manual for details)')
parser.add_argument('--set-GTs', metavar = '', type = str, default = '.', help = 'set genotypes of failed samples to missing (.) or ref (0)')
parser.add_argument('--out', metavar = '[prefix]', type = str, default = 'plink', help = 'Specify prefix for output files')
parser.add_argument('--mem', metavar = '', type = int, help = 'main workspace size, in GB')
parser.add_argument('--impute-sex', metavar = '{female max F} {male min F}', type = float, nargs = '*', help = 'Impute sex of samples from X chromosome inbreeding coefficients [.4 .4]')
parser.add_argument('--pdf', metavar = '[filename]', type = str, help = 'Generate F statistic histogram for sex imputation')
try:
parser.error = parser.exit
args = parser.parse_args()
if args.impute_sex == []:
args.impute_sex = [.4, .4]
except SystemExit:
parser.print_help()
exit(2)
# make sure the ouput directory exists
outdir = os.path.dirname(args.out)
if outdir != '' and not os.path.isdir(outdir):
os.makedirs(outdir)
# invoke bcftools to preprocess VCF file
sys.stderr.write('vcf2plink.py: Convert VCF file to plink\n')
if args.vcf == '-':
p1 = Popen(['bcftools', 'norm', '-Ou', '-m', '-any'], stdin = sys.stdin.buffer, stdout = PIPE)
else:
p1 = Popen(['bcftools', 'norm', '-Ou', '-m', '-any', args.vcf], stdout = PIPE)
p2 = Popen(['bcftools', 'norm', '-Ou', '-f', args.ref], stdin = p1.stdout, stdout = PIPE)
if args.filter:
p2a = Popen(['bcftools', 'filter', '-Ou', '-e', args.filter, '--set-GTs', args.set_GTs], stdin = p2.stdout, stdout = PIPE)
p3 = Popen(['bcftools', 'annotate', '-Ob', '-x', 'ID', '-I', '+%CHROM:%POS:%REF:%ALT'], stdin = p2a.stdout if args.filter else p2.stdout, stdout = PIPE)
# invoke plink to convert VCF file
if call(['plink', '--make-bed',
'--keep-allele-order',
'--bcf', '/dev/stdin',
'--vcf-idspace-to', '_',
'--const-fid',
'--allow-extra-chr', '0',
'--split-x', 'b37', 'no-fail',
'--out', args.out] + (['--memory', str(1024 * args.mem)] if args.mem else []), stdin = p3.stdout):
raise Exception('vcf2plink.py: Problems converting VCF file to plink')
# impute sex using chromosome X calls
if args.impute_sex:
if call(['plink', '--make-bed',
'--bfile', args.out,
'--keep-allele-order',
'--impute-sex', 'ycount', str(args.impute_sex[0]), str(args.impute_sex[1]), '10000', '0',
'--out', args.out]):
raise Exception('vcf2plink.py: Problems imputing sex')
if call(['rm', args.out + '.bed~', args.out + '.bim~', args.out + '.fam~']):
raise Exception('vcf2plink.py: Problems removing temporary files')
if args.pdf:
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
df = pd.read_csv(args.out + '.sexcheck', delim_whitespace = True)
with PdfPages(args.pdf) as pdf:
ax = df.plot(kind = 'scatter', x = 'F', y = 'YCOUNT', alpha = 1/2)
ax.set_xlabel('Chromosome X inbreeding coefficient (F)')
ax.set_ylabel('Chromosome Y non-missing genotypes (YCOUNT)')
pdf.savefig(ax.get_figure())