{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"import gzip\n",
"import re\n",
"import pandas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2015-06-07 14:50:22-- ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz\n",
" => ‘download/.listing’\n",
"Resolving ftp.ncbi.nih.gov (ftp.ncbi.nih.gov)... 130.14.250.7, 2607:f220:41e:250::10\n",
"Connecting to ftp.ncbi.nih.gov (ftp.ncbi.nih.gov)|130.14.250.7|:21... connected.\n",
"Logging in as anonymous ... Logged in!\n",
"==> SYST ... done. ==> PWD ... done.\n",
"==> TYPE I ... done. ==> CWD (1) /gene/DATA/GENE_INFO/Mammalia ... done.\n",
"==> PASV ... done. ==> LIST ... done.\n",
"\n",
" [ <=> ] 672 --.-K/s in 0.01s \n",
"\n",
"2015-06-07 14:50:23 (56.9 KB/s) - ‘download/.listing’ saved [672]\n",
"\n",
"Removed ‘download/.listing’.\n",
"Remote file no newer than local file ‘download/Homo_sapiens.gene_info.gz’ -- not retrieving.\n"
]
}
],
"source": [
"# Fetch the human Entrez Gene info table from NCBI into download/.\n",
"# --timestamping makes wget skip the transfer when the remote file is no\n",
"# newer than the local copy, so re-running this cell is cheap.\n",
"url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'\n",
"! wget --timestamping --directory-prefix=download/ {url}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"56425"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the Entrez gene_info table into gene_df ('-' marks a missing value).\n",
"# The first line of the file is a comment naming the columns. Two layouts are\n",
"# accepted:\n",
"#   1. '#Format: <space-separated names> (...)'  -- the layout this notebook\n",
"#      was written against\n",
"#   2. '#<tab-separated names>'  -- NOTE(review): newer NCBI releases appear\n",
"#      to use this bare header; confirm against a current download\n",
"# Anything else raises a ValueError instead of failing on a None match.\n",
"path = os.path.join('download', 'Homo_sapiens.gene_info.gz')\n",
"\n",
"with gzip.open(path, 'rt') as read_file:\n",
"    header = next(read_file).rstrip()\n",
"    matches = re.match(r'#Format: (.+) \\(', header)\n",
"    if matches:\n",
"        columns = matches.group(1).split(' ')\n",
"    elif header.startswith('#') and '\\t' in header:\n",
"        columns = header[1:].split('\\t')\n",
"    else:\n",
"        raise ValueError('Unrecognized gene_info header: {!r}'.format(header))\n",
"    # The header line is already consumed, so the remaining lines are all data\n",
"    gene_df = pandas.read_table(read_file, names=columns, na_values=['-'])\n",
"\n",
"len(gene_df)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Extract symbols/synonyms and database cross-references into two long-format\n",
"# tables (one row per gene-value pair) and write each to data/ as a TSV.\n",
"xref_rows = list()\n",
"symbol_rows = list()\n",
"\n",
"# Walk only the four columns that are used. Unlike iterrows(), this does not\n",
"# build a Series per row and keeps each column's own dtype.\n",
"fields = zip(gene_df.GeneID, gene_df.Symbol, gene_df.Synonyms, gene_df.dbXrefs)\n",
"for gene_id, symbol, synonyms, dbXrefs in fields:\n",
"\n",
"    # symbols\n",
"    if pandas.notnull(symbol):\n",
"        symbol_rows.append((gene_id, 'symbol', symbol))\n",
"\n",
"    # synonyms: pipe-delimited list\n",
"    if pandas.notnull(synonyms):\n",
"        for synonym in synonyms.split('|'):\n",
"            symbol_rows.append((gene_id, 'synonym', synonym))\n",
"\n",
"    # xrefs: pipe-delimited 'resource:identifier' pairs. Split on the first\n",
"    # colon only, because an identifier may itself contain one (e.g. HGNC:5).\n",
"    if pandas.notnull(dbXrefs):\n",
"        for xref in dbXrefs.split('|'):\n",
"            db, ref = xref.split(':', 1)\n",
"            xref_rows.append((gene_id, db, ref))\n",
"\n",
"# to_csv does not create missing directories, so make sure data/ exists\n",
"os.makedirs('data', exist_ok=True)\n",
"\n",
"xref_df = pandas.DataFrame(xref_rows, columns=['GeneID', 'resource', 'identifier'])\n",
"xref_df.to_csv('data/xrefs-human.tsv', sep='\\t', index=False)\n",
"\n",
"symbol_df = pandas.DataFrame(symbol_rows, columns=['GeneID', 'type', 'symbol'])\n",
"symbol_df.to_csv('data/symbols-human.tsv', sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" GeneID | \n",
" resource | \n",
" identifier | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" MIM | \n",
" 138670 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" HGNC | \n",
" HGNC:5 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" Ensembl | \n",
" ENSG00000121410 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" HPRD | \n",
" 00726 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" Vega | \n",
" OTTHUMG00000183507 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GeneID resource identifier\n",
"0 1 MIM 138670\n",
"1 1 HGNC HGNC:5\n",
"2 1 Ensembl ENSG00000121410\n",
"3 1 HPRD 00726\n",
"4 1 Vega OTTHUMG00000183507"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the cross-reference table\n",
"xref_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GeneID | \n",
" type | \n",
" symbol | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" symbol | \n",
" A1BG | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" synonym | \n",
" A1B | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" synonym | \n",
" ABG | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" synonym | \n",
" GAB | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" synonym | \n",
" HYST2477 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GeneID type symbol\n",
"0 1 symbol A1BG\n",
"1 1 synonym A1B\n",
"2 1 synonym ABG\n",
"3 1 synonym GAB\n",
"4 1 synonym HYST2477"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the symbol/synonym table\n",
"symbol_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tax_id | \n",
" GeneID | \n",
" Symbol | \n",
" chromosome | \n",
" map_location | \n",
" type_of_gene | \n",
" description | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 9606 | \n",
" 1 | \n",
" A1BG | \n",
" 19 | \n",
" 19q13.4 | \n",
" protein-coding | \n",
" alpha-1-B glycoprotein | \n",
"
\n",
" \n",
" 1 | \n",
" 9606 | \n",
" 2 | \n",
" A2M | \n",
" 12 | \n",
" 12p13.31 | \n",
" protein-coding | \n",
" alpha-2-macroglobulin | \n",
"
\n",
" \n",
" 2 | \n",
" 9606 | \n",
" 3 | \n",
" A2MP1 | \n",
" 12 | \n",
" 12p13.31 | \n",
" pseudo | \n",
" alpha-2-macroglobulin pseudogene 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 9606 | \n",
" 9 | \n",
" NAT1 | \n",
" 8 | \n",
" 8p22 | \n",
" protein-coding | \n",
" N-acetyltransferase 1 (arylamine N-acetyltrans... | \n",
"
\n",
" \n",
" 4 | \n",
" 9606 | \n",
" 10 | \n",
" NAT2 | \n",
" 8 | \n",
" 8p22 | \n",
" protein-coding | \n",
" N-acetyltransferase 2 (arylamine N-acetyltrans... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tax_id GeneID Symbol chromosome map_location type_of_gene \\\n",
"0 9606 1 A1BG 19 19q13.4 protein-coding \n",
"1 9606 2 A2M 12 12p13.31 protein-coding \n",
"2 9606 3 A2MP1 12 12p13.31 pseudo \n",
"3 9606 9 NAT1 8 8p22 protein-coding \n",
"4 9606 10 NAT2 8 8p22 protein-coding \n",
"\n",
" description \n",
"0 alpha-1-B glycoprotein \n",
"1 alpha-2-macroglobulin \n",
"2 alpha-2-macroglobulin pseudogene 1 \n",
"3 N-acetyltransferase 1 (arylamine N-acetyltrans... \n",
"4 N-acetyltransferase 2 (arylamine N-acetyltrans... "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Save a selected subset of the gene_info columns as the main gene table\n",
"columns = ['tax_id', 'GeneID', 'Symbol', 'chromosome', 'map_location', 'type_of_gene', 'description']\n",
"select_df = gene_df[columns]\n",
"\n",
"# to_csv does not create missing directories, so make sure data/ exists\n",
"os.makedirs('data', exist_ok=True)\n",
"select_df.to_csv('data/genes-human.tsv', sep='\\t', index=False)\n",
"select_df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}