{ "cells": [ { "cell_type": "code", "source": [ "#!/usr/bin/env python3\n", "import os\n", "import re\n", "import sys\n", "import collections\n", "import argparse\n", "#import tables\n", "import itertools\n", "import matplotlib\n", "import glob\n", "import math\n", "%matplotlib inline\n", "\n", "import matplotlib.pyplot as plt\n", "import matplotlib.gridspec as gridspec\n", "import numpy as np\n", "import pandas as pd\n", "import scipy.stats as stats\n", "import scipy.sparse as sp_sparse\n", "\n", "from multiprocessing import Pool\n", "from collections import defaultdict\n", "from scipy import sparse, io\n", "from scipy.sparse import csr_matrix\n", "from multiprocessing import Pool\n", "#from matplotlib_venn import venn2, venn2_circles\n", "matplotlib.rcParams['pdf.fonttype'] = 42\n", "matplotlib.rcParams['ps.fonttype'] = 42" ], "outputs": [], "execution_count": 1, "metadata": {} }, { "cell_type": "code", "source": [ "# Load the GWAS Catalog associations table (tab-separated, column names on the first row).\n", "gwas_catalog_path = './gwas_catalog_v1.0.2-associations_e95_r2019-03-01.tsv'\n", "GWAS_df = pd.read_csv(\n", "    gwas_catalog_path,\n", "    sep='\\t',\n", "    header=0,\n", "    low_memory=False,\n", ")" ], "outputs": [], "execution_count": 2, "metadata": {} }, { "cell_type": "code", "source": [ "GWAS_df.columns" ], "outputs": [ { "output_type": "execute_result", "execution_count": 3, "data": { "text/plain": [ "Index(['DATE ADDED TO CATALOG', 'PUBMEDID', 'FIRST AUTHOR', 'DATE', 'JOURNAL',\n", " 'LINK', 'STUDY', 'DISEASE/TRAIT', 'INITIAL SAMPLE SIZE',\n", " 'REPLICATION SAMPLE SIZE', 'REGION', 'CHR_ID', 'CHR_POS',\n", " 'REPORTED GENE(S)', 'MAPPED_GENE', 'UPSTREAM_GENE_ID',\n", " 'DOWNSTREAM_GENE_ID', 'SNP_GENE_IDS', 'UPSTREAM_GENE_DISTANCE',\n", " 'DOWNSTREAM_GENE_DISTANCE', 'STRONGEST SNP-RISK ALLELE', 'SNPS',\n", " 'MERGED', 'SNP_ID_CURRENT', 'CONTEXT', 'INTERGENIC',\n", " 'RISK ALLELE FREQUENCY', 'P-VALUE', 'PVALUE_MLOG', 'P-VALUE (TEXT)',\n", " 'OR or BETA', '95% CI (TEXT)', 'PLATFORM [SNPS PASSING QC]', 'CNV',\n", " 'MAPPED_TRAIT', 'MAPPED_TRAIT_URI', 'STUDY ACCESSION',\n", " 'GENOTYPING TECHNOLOGY'],\n", " dtype='object')" ] }, 
"metadata": {} } ], "execution_count": 3, "metadata": {} }, { "cell_type": "code", "source": [ "#chr6:135252920-135391745\n", "#chr6:135089817-135228642\n", "#region = 'chr6:135252920-135421745'\n", "\n", "## Now we get all the SNPs within the MYB enhancer region\n", "region = 'chr6:135152920-135921745'\n", "chrom, left, right = re.split(':|-', region)\n", "snp_idx = []\n", "hits_df = pd.DataFrame()\n", "for i, row in GWAS_df.loc[(GWAS_df.CHR_ID == '6')].iterrows():\n", " try: \n", " pos = int(row.CHR_POS)\n", "# print(pos)\n", " if (pos > int(left)) & (pos < int(right)):\n", " snp_idx.append(i)\n", " hits_df = hits_df.append(row)\n", " sys.exit(0)\n", " except:\n", " next" ], "outputs": [], "execution_count": 5, "metadata": {} }, { "cell_type": "code", "source": [ "pd.set_option(\"display.max_rows\", 200)\n", "\n", "hits_df[['CHR_ID', 'CHR_POS', 'PVALUE_MLOG', 'DISEASE/TRAIT', 'SNPS', 'LINK']].sort_values(by='CHR_POS').head()" ], "outputs": [ { "output_type": "execute_result", "execution_count": 6, "data": { "text/plain": [ " CHR_ID CHR_POS PVALUE_MLOG DISEASE/TRAIT \\\n", "116684 6 135161428 9.301030 Balding type 1 \n", "105128 6 135165003 14.397940 Red cell distribution width \n", "24037 6 135173737 5.698970 Multiple sclerosis \n", "103354 6 135174088 252.522879 Mean corpuscular hemoglobin \n", "6928 6 135178322 8.221849 White blood cell count (basophil) \n", "\n", " SNPS LINK \n", "116684 rs6569999 www.ncbi.nlm.nih.gov/pubmed/30595370 \n", "105128 rs113617776 www.ncbi.nlm.nih.gov/pubmed/30595370 \n", "24037 rs9321490 www.ncbi.nlm.nih.gov/pubmed/21833088 \n", "103354 rs2327586 www.ncbi.nlm.nih.gov/pubmed/30595370 \n", "6928 rs9376098 www.ncbi.nlm.nih.gov/pubmed/27863252 " ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CHR_IDCHR_POSPVALUE_MLOGDISEASE/TRAITSNPSLINK
11668461351614289.301030Balding type 1rs6569999www.ncbi.nlm.nih.gov/pubmed/30595370
105128613516500314.397940Red cell distribution widthrs113617776www.ncbi.nlm.nih.gov/pubmed/30595370
2403761351737375.698970Multiple sclerosisrs9321490www.ncbi.nlm.nih.gov/pubmed/21833088
1033546135174088252.522879Mean corpuscular hemoglobinrs2327586www.ncbi.nlm.nih.gov/pubmed/30595370
692861351783228.221849White blood cell count (basophil)rs9376098www.ncbi.nlm.nih.gov/pubmed/27863252
\n", "
" ] }, "metadata": {} } ], "execution_count": 6, "metadata": {} }, { "cell_type": "markdown", "source": [ "### Print a bed format file for all the SNPs" ], "metadata": {} }, { "cell_type": "code", "source": [ "final_hits_df = hits_df[['CHR_ID', 'CHR_POS', 'PVALUE_MLOG', 'DISEASE/TRAIT', 'SNPS']].sort_values(by='CHR_POS')\n", "\n", "snp_list = []\n", "for i, row in final_hits_df.iterrows():\n", " snp = '\\t'.join(row[['CHR_ID', 'CHR_POS', 'SNPS']].values)\n", " snp_list.append(snp)" ], "outputs": [], "execution_count": 9, "metadata": {} }, { "cell_type": "code", "source": [ "for i in np.unique(snp_list):\n", " chrom, left, snp = i.split('\\t')\n", " right = int(left) + 1\n", " print('chr', end = '')\n", " print('\\t'.join([chrom, str(int(left)-300), str(right+300), snp]))" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "chr6\t135161128\t135161729\trs6569999\n", "chr6\t135164703\t135165304\trs113617776\n", "chr6\t135173437\t135174038\trs9321490\n", "chr6\t135173788\t135174389\trs2327586\n", "chr6\t135178022\t135178623\trs9376098\n", "chr6\t135182347\t135182948\trs210962\n", "chr6\t135193392\t135193993\trs34931156\n", "chr6\t135201688\t135202289\trs181880988\n", "chr6\t135211846\t135212447\trs210946\n", "chr6\t135218942\t135219543\trs210937\n", "chr6\t135264540\t135265141\trs118038583\n", "chr6\t135304910\t135305511\trs6928977\n", "chr6\t135318206\t135318807\trs6914831\n", "chr6\t135328183\t135328784\trs146318841\n", "chr6\t135347593\t135348194\trs17064440\n", "chr6\t135347659\t135348260\trs2746427\n", "chr6\t135370078\t135370679\trs2246943\n", "chr6\t135375159\t135375760\trs2746432\n", "chr6\t135386048\t135386649\trs7773987\n", "chr6\t135391000\t135391601\trs12190426\n", "chr6\t135396630\t135397231\trs2746438\n", "chr6\t135399847\t135400448\trs13208164\n", "chr6\t135417917\t135418518\trs11154801\n", "chr6\t135538259\t135538860\trs9402715\n", "chr6\t135539603\t135540204\trs144124553\n", "chr6\t135568367\t135568968\trs12663042\n", 
"chr6\t135581161\t135581762\trs761357\n", "chr6\t135589473\t135590074\trs116990752\n", "chr6\t135629979\t135630580\trs2327650\n", "chr6\t135679565\t135680166\trs9389316\n", "chr6\t135679596\t135680197\trs9402743\n", "chr6\t135744375\t135744976\trs1119555\n", "chr6\t135746578\t135747179\trs4895455\n", "chr6\t135756491\t135757092\trs79846291\n", "chr6\t135812221\t135812822\trs947583\n", "chr6\t135840984\t135841585\trs117976152\n", "chr6\t135883067\t135883668\trs10872447\n", "chr6\t135892717\t135893318\trs2027518\n", "chr6\t135905816\t135906417\trs9376164\n", "chr6\t135907179\t135907780\trs78928932\n", "chr6\t135907237\t135907838\trs12209685\n", "chr6\t135909573\t135910174\trs12529766\n", "chr6\t135914130\t135914731\trs12208105\n" ] } ], "execution_count": 10, "metadata": {} }, { "cell_type": "markdown", "source": [ "### The major catalog of traits\n", "\n", "For similicity, we group all the traits to a few major catalogs, and then use WashU genome browser to visualize all of them. " ], "metadata": {} }, { "cell_type": "markdown", "source": [ "### Red blood cell related\n", "- Red blood cell count\n", "- Red cell distribution width\n", "- Mean corpuscular hemoglobin\n", "\n", "### Myeloid cell related\n", "- White blood cell count (basophil)\n", "- Neutrophil count\n", "- Sum basophil neutrophil counts\n", "- White blood cell count\n", "- Basophil percentage of granulocytes\n", "- Basophil percentage of white cells\n", "- Granulocyte count\n", "- Myeloid white cell count\n", "\n", "### Other Blood\n", "- Blood osmolality (transformed sodium)\n", "- Diastolic blood pressure\n", "- Selective IgA deficiency\n", "\n", "### Immune / Autoimmune\n", "- Autoimmune traits\n", "- Hodgkin's lymphoma\n", "- Multiple sclerosis\n", "- Systemic lupus erythematosus\n", "- Nodular sclerosis Hodgkin lymphoma\n", "- Immune reponse to smallpox (secreted IL-2)\n", "\n", "### Other\n", "- Balding type 1\n", "- Eczema\n", "- Heel bone mineral density\n", "- Hippocampal volume in Alzheimer's 
disease dementia\n", "- Lung function (FVC)\n", "- Menarche (age at onset)\n", "- Obesity-related traits\n", "- Peripheral arterial disease (traffic-related air pollution interaction)\n", "- Phosphorus levels\n", "- Respiratory diseases\n", "- Hypothyroidism" ], "metadata": {} } ], "metadata": { "kernelspec": { "name": "python3", "language": "python", "display_name": "Python 3" }, "language_info": { "name": "python", "version": "3.7.0", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" }, "kernel_info": { "name": "python3" }, "nteract": { "version": "0.14.0" } }, "nbformat": 4, "nbformat_minor": 2 }