{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Allele frequencies in 1000 Genomes Phase 3\n", "\n", "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n", "import gzip\n", "import csv\n", "\n", "import pandas\n", "import vcf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Options and file locations" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr10.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr11.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr12.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr18.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr19.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz',\n", " 'ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz',\n", " 'ALL.chrY.phase3_integrated_v1b.20130502.genotypes.vcf.gz']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Compute vcf file names\n", "vcf_files = [\n", " 'ALL.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'.format(x)\n", " for x in range(1, 23)]\n", "vcf_files += [\n", " 'ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz',\n", " 'ALL.chrY.phase3_integrated_v1b.20130502.genotypes.vcf.gz',\n", "]\n", "vcf_files" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "binary = '/home/dhimmels/bin/bin/bcftools'\n", "download_dir = 'download'\n", "subset_dir = 'data/common-snps/vcf/'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Maximum major allele frequency to allow\n", "major_af_max = 0.95" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Created subsetted vcf files using `bcftools`" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: The index file is older than the data file: download/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr10.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr11.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr12.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr18.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr19.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz.tbi\n", "Warning: The index file is older than the data file: download/ALL.chrY.phase3_integrated_v1b.20130502.genotypes.vcf.gz.tbi\n" ] } ], "source": [ "for vcf_file in vcf_files:\n", " ! {binary} view --exclude-types indels,mnps,other \\\n", " --apply-filters PASS --max-af {major_af_max}:'major' \\\n", " --drop-genotypes --output-type z \\\n", " --output-file {subset_dir}/{vcf_file} {download_dir}/{vcf_file}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Combine subsetted vcf variants into a tsv" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def get_majar_af(aafs):\n", " \"\"\"\n", " Returns the major allele frequency from\n", " the list of alternate allele frequencies.\n", " \"\"\"\n", " allele_freqs = aafs + [1 - sum(aafs)]\n", " return max(allele_freqs)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "write_file = gzip.open('data/common-snps/common-snps.tsv.gz', 'wt')\n", "writer = csv.writer(write_file, delimiter='\\t')\n", "writer.writerow(('chromosome', 'position', 'major_allele_frequency', 'rsid'))\n", "for vcf_file in vcf_files:\n", " path = os.path.join(subset_dir, vcf_file)\n", " \n", " for r in vcf.Reader(filename=path):\n", " # Quality control\n", " if r.FILTER:\n", " print(r.FILTER)\n", " continue\n", "\n", " # Exclude non-SNPs\n", " if not r.is_snp:\n", " print(r.var_type)\n", " continue\n", "\n", " # Major allele frequency check\n", " major_af = get_majar_af(r.INFO['AF'])\n", " if major_af > major_af_max:\n", " print(major_af)\n", " continue\n", " \n", " # Write SNP to tsv\n", " row = r.CHROM, r.POS, round(major_af, 6), r.ID\n", " writer.writerow(row)\n", "\n", "write_file.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert to a bed file" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromchromStartchromEnd
0chr11100711008
1chr11101111012
\n", "
" ], "text/plain": [ " chrom chromStart chromEnd\n", "0 chr1 11007 11008\n", "1 chr1 11011 11012" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bed_df = pandas.read_table('data/common-snps/common-snps.tsv.gz', low_memory=False)\n", "bed_df['chrom'] = 'chr' + bed_df['chromosome']\n", "bed_df['chromStart'] = bed_df['position'] - 1\n", "bed_df['chromEnd'] = bed_df['position']\n", "bed_df = bed_df[['chrom', 'chromStart', 'chromEnd']]\n", "bed_df.head(2)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "with gzip.open('data/common-snps/common-snps.bed.gz', 'wt') as write_file:\n", " bed_df.to_csv(write_file, index=False, sep='\\t', header=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }