{ "cells": [ { "cell_type": "code", "source": [ "#!/usr/bin/env python3\n", "import os\n", "import re\n", "import sys\n", "import collections\n", "import argparse\n", "#import tables\n", "import itertools\n", "import matplotlib\n", "import glob\n", "import math\n", "%matplotlib inline\n", "\n", "import matplotlib.pyplot as plt\n", "import matplotlib.gridspec as gridspec\n", "import numpy as np\n", "import pandas as pd\n", "import scipy.stats as stats\n", "import scipy.sparse as sp_sparse\n", "\n", "from multiprocessing import Pool\n", "from collections import defaultdict\n", "from scipy import sparse, io\n", "from scipy.sparse import csr_matrix\n", "from multiprocessing import Pool\n", "#from matplotlib_venn import venn2, venn2_circles\n", "matplotlib.rcParams['pdf.fonttype'] = 42\n", "matplotlib.rcParams['ps.fonttype'] = 42" ], "outputs": [], "execution_count": 1, "metadata": {} }, { "cell_type": "code", "source": [ "GWAS_df = pd.read_csv('./gwas_catalog_v1.0.2-associations_e95_r2019-03-01.tsv', sep='\\t', header=0, low_memory=False)" ], "outputs": [], "execution_count": 2, "metadata": {} }, { "cell_type": "code", "source": [ "GWAS_df.columns" ], "outputs": [ { "output_type": "execute_result", "execution_count": 3, "data": { "text/plain": [ "Index(['DATE ADDED TO CATALOG', 'PUBMEDID', 'FIRST AUTHOR', 'DATE', 'JOURNAL',\n", " 'LINK', 'STUDY', 'DISEASE/TRAIT', 'INITIAL SAMPLE SIZE',\n", " 'REPLICATION SAMPLE SIZE', 'REGION', 'CHR_ID', 'CHR_POS',\n", " 'REPORTED GENE(S)', 'MAPPED_GENE', 'UPSTREAM_GENE_ID',\n", " 'DOWNSTREAM_GENE_ID', 'SNP_GENE_IDS', 'UPSTREAM_GENE_DISTANCE',\n", " 'DOWNSTREAM_GENE_DISTANCE', 'STRONGEST SNP-RISK ALLELE', 'SNPS',\n", " 'MERGED', 'SNP_ID_CURRENT', 'CONTEXT', 'INTERGENIC',\n", " 'RISK ALLELE FREQUENCY', 'P-VALUE', 'PVALUE_MLOG', 'P-VALUE (TEXT)',\n", " 'OR or BETA', '95% CI (TEXT)', 'PLATFORM [SNPS PASSING QC]', 'CNV',\n", " 'MAPPED_TRAIT', 'MAPPED_TRAIT_URI', 'STUDY ACCESSION',\n", " 'GENOTYPING TECHNOLOGY'],\n", " dtype='object')" ] }, 
"metadata": {} } ], "execution_count": 3, "metadata": {} }, { "cell_type": "code", "source": [
 "## Collect every GWAS catalog association that falls inside the MYB enhancer region.\n",
 "# Windows tried earlier, kept for reference:\n",
 "#   chr6:135252920-135391745\n",
 "#   chr6:135089817-135228642\n",
 "#   chr6:135252920-135421745\n",
 "region = 'chr6:135152920-135921745'\n",
 "chrom, left, right = re.split(':|-', region)  # all three remain strings\n",
 "region_start, region_end = int(left), int(right)\n",
 "\n",
 "# CHR_ID is compared as a string without the 'chr' prefix (e.g. '6'); derive it\n",
 "# from `region` so that editing the region cannot silently keep querying chr6.\n",
 "chrom_id = chrom[3:] if chrom.startswith('chr') else chrom\n",
 "chrom_df = GWAS_df.loc[GWAS_df.CHR_ID == chrom_id]\n",
 "\n",
 "# Rows whose CHR_POS cannot be parsed as a number become NaN and therefore fail\n",
 "# the range test. The previous row-by-row loop skipped them with a bare `except:`,\n",
 "# which also swallowed the SystemExit raised by a leftover `sys.exit(0)`.\n",
 "snp_pos = pd.to_numeric(chrom_df.CHR_POS, errors='coerce')\n",
 "in_region = (snp_pos > region_start) & (snp_pos < region_end)  # both bounds exclusive\n",
 "\n",
 "# A single boolean selection replaces DataFrame.append per row, which was\n",
 "# quadratic and no longer exists in pandas 2.0. Original index labels are kept.\n",
 "hits_df = chrom_df.loc[in_region].copy()\n",
 "snp_idx = hits_df.index.tolist()"
 ], "outputs": [], "execution_count": 5, "metadata": {} }, { "cell_type": "code", "source": [ "pd.set_option(\"display.max_rows\", 200)\n", "\n", "hits_df[['CHR_ID', 'CHR_POS', 'PVALUE_MLOG', 'DISEASE/TRAIT', 'SNPS', 'LINK']].sort_values(by='CHR_POS').head()" ], "outputs": [ { "output_type": "execute_result", "execution_count": 6, "data": { "text/plain": [ " CHR_ID CHR_POS PVALUE_MLOG DISEASE/TRAIT \\\n", "116684 6 135161428 9.301030 Balding type 1 \n", "105128 6 135165003 14.397940 Red cell distribution width \n", "24037 6 135173737 5.698970 Multiple sclerosis \n", "103354 6 135174088 252.522879 Mean corpuscular hemoglobin \n", "6928 6 135178322 8.221849 White blood cell count (basophil) \n", "\n", " SNPS LINK \n", "116684 rs6569999 www.ncbi.nlm.nih.gov/pubmed/30595370 \n", "105128 rs113617776 www.ncbi.nlm.nih.gov/pubmed/30595370 \n", "24037 rs9321490 www.ncbi.nlm.nih.gov/pubmed/21833088 \n", "103354 rs2327586 www.ncbi.nlm.nih.gov/pubmed/30595370 \n", "6928 rs9376098 www.ncbi.nlm.nih.gov/pubmed/27863252 " ], "text/html": [ "
\n", " | CHR_ID | \n", "CHR_POS | \n", "PVALUE_MLOG | \n", "DISEASE/TRAIT | \n", "SNPS | \n", "LINK | \n", "
---|---|---|---|---|---|---|
116684 | \n", "6 | \n", "135161428 | \n", "9.301030 | \n", "Balding type 1 | \n", "rs6569999 | \n", "www.ncbi.nlm.nih.gov/pubmed/30595370 | \n", "
105128 | \n", "6 | \n", "135165003 | \n", "14.397940 | \n", "Red cell distribution width | \n", "rs113617776 | \n", "www.ncbi.nlm.nih.gov/pubmed/30595370 | \n", "
24037 | \n", "6 | \n", "135173737 | \n", "5.698970 | \n", "Multiple sclerosis | \n", "rs9321490 | \n", "www.ncbi.nlm.nih.gov/pubmed/21833088 | \n", "
103354 | \n", "6 | \n", "135174088 | \n", "252.522879 | \n", "Mean corpuscular hemoglobin | \n", "rs2327586 | \n", "www.ncbi.nlm.nih.gov/pubmed/30595370 | \n", "
6928 | \n", "6 | \n", "135178322 | \n", "8.221849 | \n", "White blood cell count (basophil) | \n", "rs9376098 | \n", "www.ncbi.nlm.nih.gov/pubmed/27863252 | \n", "