"""Extract human Entrez Gene symbols, synonyms and cross-references.

Source data: ``Homo_sapiens.gene_info.gz`` from the NCBI Gene FTP site.

Files written (tab-separated, no index column):

* ``data/xrefs-human.tsv``   -- GeneID, resource, identifier
* ``data/symbols-human.tsv`` -- GeneID, type ('symbol' or 'synonym'), symbol
* ``data/genes-human.tsv``   -- a selected subset of the gene_info columns

The parsing and extraction steps are plain functions, so they can be imported
and checked without triggering the download or writing any file.
"""
import gzip
import os
import re
import subprocess

import pandas

GENE_INFO_URL = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'


def download_gene_info(url, directory='download/'):
    """Fetch ``url`` into ``directory`` using wget.

    ``--timestamping`` makes wget skip the transfer when the remote file is no
    newer than the local copy. The command is passed as an argument list, so
    nothing in ``url`` is interpreted by a shell.

    Returns:
        wget's exit status (0 on success), or 127 when wget could not be
        started at all. Failures are reported rather than raised so that a
        previously downloaded copy can still be used.
    """
    command = ['wget', '--timestamping', '--directory-prefix', directory, url]
    try:
        return subprocess.call(command)
    except OSError as error:
        # wget is missing or not executable.
        print('could not run wget: {}'.format(error))
        return 127


def parse_format_header(header_line):
    """Return the column names listed in a ``#Format: ...`` header line.

    The header is expected to look like
    ``#Format: tax_id GeneID Symbol ... (free-text note)``: space-separated
    column names followed by a parenthesised remark.

    Raises:
        ValueError: if ``header_line`` does not follow that layout.
    """
    match = re.match(r'#Format: (.+) \(', header_line)
    if match is None:
        raise ValueError(
            'unrecognised gene_info header: {!r}'.format(header_line))
    return match.group(1).split(' ')


def read_gene_info(path):
    """Read a gzipped gene_info file into a DataFrame.

    Args:
        path: filename (or binary file object) of the gzip-compressed table.
            The first line must be the ``#Format:`` header naming the columns.

    Returns:
        pandas.DataFrame with one row per gene; ``-`` and empty fields are
        read as missing values.
    """
    with gzip.open(path, 'rt') as read_file:
        column_names = parse_format_header(next(read_file))
        # keep_default_na=False: only '-' and empty fields count as missing.
        # With the pandas defaults, text that is literally 'NA', 'NULL',
        # 'nan', ... would silently be turned into missing values as well.
        return pandas.read_table(
            read_file,
            names=column_names,
            na_values=['-', ''],
            keep_default_na=False,
        )


def extract_symbols_and_xrefs(gene_df):
    """Split the pipe-delimited symbol and cross-reference fields into rows.

    Args:
        gene_df: DataFrame with ``GeneID``, ``Symbol``, ``Synonyms`` and
            ``dbXrefs`` columns. ``Synonyms`` and ``dbXrefs`` hold
            ``|``-separated entries; each dbXref is ``resource:identifier``.

    Returns:
        ``(symbol_df, xref_df)`` where ``symbol_df`` has the columns
        ``GeneID, type, symbol`` and ``xref_df`` has the columns
        ``GeneID, resource, identifier``. Missing fields contribute no rows.

    Raises:
        ValueError: if a dbXref entry contains no ``:`` separator.
    """
    symbol_rows = []
    xref_rows = []

    # Iterating the four needed columns in parallel avoids building a Series
    # per row, which is what DataFrame.iterrows() does.
    fields = zip(gene_df['GeneID'], gene_df['Symbol'],
                 gene_df['Synonyms'], gene_df['dbXrefs'])
    for gene_id, symbol, synonyms, db_xrefs in fields:
        if pandas.notnull(symbol):
            symbol_rows.append((gene_id, 'symbol', symbol))

        if pandas.notnull(synonyms):
            for synonym in synonyms.split('|'):
                symbol_rows.append((gene_id, 'synonym', synonym))

        if pandas.notnull(db_xrefs):
            for xref in db_xrefs.split('|'):
                # Split on the first ':' only: identifiers may contain
                # colons themselves (e.g. 'HGNC:HGNC:5').
                resource, separator, identifier = xref.partition(':')
                if not separator:
                    raise ValueError(
                        'dbXref {!r} of GeneID {} is not of the form '
                        'resource:identifier'.format(xref, gene_id))
                xref_rows.append((gene_id, resource, identifier))

    symbol_df = pandas.DataFrame(symbol_rows, columns=['GeneID', 'type', 'symbol'])
    xref_df = pandas.DataFrame(xref_rows, columns=['GeneID', 'resource', 'identifier'])
    return symbol_df, xref_df


# The pipeline runs when executed as a script or inside a notebook kernel
# (where __name__ is '__main__') but not on import. Results are assigned at
# module level so they stay available for interactive inspection.
if __name__ == '__main__':
    # Download human entrez gene information
    url = GENE_INFO_URL
    status = download_gene_info(url)
    if status != 0:
        print('wget exited with status {}; using any previously '
              'downloaded copy'.format(status))

    # Read Entrez info dataset
    path = os.path.join('download', 'Homo_sapiens.gene_info.gz')
    gene_df = read_gene_info(path)
    print(len(gene_df))

    # extract symbols and xrefs
    symbol_df, xref_df = extract_symbols_and_xrefs(gene_df)

    # A fresh checkout may not have the output directory yet.
    os.makedirs('data', exist_ok=True)
    xref_df.to_csv('data/xrefs-human.tsv', sep='\t', index=False)
    symbol_df.to_csv('data/symbols-human.tsv', sep='\t', index=False)

    print(xref_df.head())
    print(symbol_df.head())

    # save a select columnset
    select_columns = ['tax_id', 'GeneID', 'Symbol', 'chromosome',
                      'map_location', 'type_of_gene', 'description']
    select_df = gene_df[select_columns]
    select_df.to_csv('data/genes-human.tsv', sep='\t', index=False)
    print(select_df.head())