{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# New Mapping File from NCBI Gene Dataset\n", "Downloaded: Jan 2, 2019" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import csv\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "file = \"/Users/maayan/sigsets/Harmonizome/Data/All_Data.gene_info_010219\"\n", "mapping_file = \"/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv\"\n", "gene_sym_ids = \"/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Human, Mouse, and Rat only\n", "mapping_dict = {}\n", "with open(file) as o:\n", " with open(gene_sym_ids, \"w\") as w:\n", " with open(mapping_file, \"w\") as m:\n", " csv_file = csv.reader(o, delimiter=\"\\t\")\n", " w.write(\"Human, Mouse, and Rat Approved Symbol\tEntrez Gene ID(supplied by NCBI)\")\n", " for row in csv_file:\n", " tax_id = row[0]\n", " if tax_id in [\"9606\", \"10090\", \"10116\"]:\n", " gene_id = row[1]\n", " if not row[2] == \"NEWENTRY\":\n", " sym = row[2]\n", " else: \n", " continue\n", " synonyms = [sym]\n", " if not row[4] == \"-\":\n", " synonyms = synonyms + row[4].split(\"|\")\n", " else:\n", " continue\n", " w.write(\"\\t\".join([tax_id, sym, gene_id]) + \"\\n\") # Add Taxon ID\n", " if not tax_id in mapping_dict:\n", " mapping_dict[tax_id] = set([])\n", " for syn in synonyms:\n", " if syn not in mapping_dict[tax_id]:\n", " mapping_dict[tax_id].add(syn)\n", " m.write(\"\\t\".join([tax_id, syn, sym])+ \"\\n\")\n", " " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "getGeneIDsHMR_updated = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv', sep='\\t', index_col=[0,1]).sort_index()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(9606,\"A2MP1\") in getGeneIDsHMR_updated.index" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "mappingDFHMR_updated = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv', sep='\\t', header=None, index_col=[0,1]).sort_index()\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(9606,\"(FM-3)\") in mappingDFHMR_updated.index" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
2
01
9606(FM-3)NMUR1
(IV)-44IGHVIV-44-1
(ppGpp)aseHDDC3
0610011B16RikCORO7
0610037N12RikPOP7
0710008D09RikUQCR11
0808y08yNFYC-AS1
1-12PIGHV1-12
1-14PIGHV1-14
1-17PIGHV1-17
1-67PIGHV1-67
1-68PIGHV1-68
1-8DIFITM2
1-8UIFITM3
1-AGPAT 6GPAT4
1-AGPAT1AGPAT1
1-AGPAT2AGPAT2
1-AGPAT4AGPAT4
1-CysPRDX6
1/2-SBSRNA4SEC24B-AS1
10-FTHFDHALDH1L1
10-fTHFALDH1L1
101F10.1KNOP1
101F6CYB561D2
104pTUBGCP3
105ASNORA73B
105BRNU105B
10CARHGAP9
10q23delBMPR1A
11-DHHSD11B1
.........
10116tpcr07Olr1398
tpcr09Olr737
tpcr10Olr1404
tpcr13Olr1366
tpcr18Olr1307
tpcr19Olr1226
tpcr21Olr1283
tpcr38Olr1606
trk-BNtrk2
trkBNtrk2
trkCNtrk3
try1Prss58
uKATP-1Kcnj8
uPARPlaur
uPAR-2Plaur
uPAR-3Plaur
ufc1-sUfc1
ufd2aUbe4b
upf0227Abhd17a
vip/phi27Vip
vms-tm2Cd99l2
wbp-11Wbp11
x85Dus1l
xylt-IIXylt2
y+LAT1Slc7a7
zbs559Map1lc3b
zgc:101121Zfand6
zgc:66482Pik3ip1
zif-268Egr1
NaNScn11a
\n", "

329092 rows × 1 columns

\n", "
" ], "text/plain": [ " 2\n", "0 1 \n", "9606 (FM-3) NMUR1\n", " (IV)-44 IGHVIV-44-1\n", " (ppGpp)ase HDDC3\n", " 0610011B16Rik CORO7\n", " 0610037N12Rik POP7\n", " 0710008D09Rik UQCR11\n", " 0808y08y NFYC-AS1\n", " 1-12P IGHV1-12\n", " 1-14P IGHV1-14\n", " 1-17P IGHV1-17\n", " 1-67P IGHV1-67\n", " 1-68P IGHV1-68\n", " 1-8D IFITM2\n", " 1-8U IFITM3\n", " 1-AGPAT 6 GPAT4\n", " 1-AGPAT1 AGPAT1\n", " 1-AGPAT2 AGPAT2\n", " 1-AGPAT4 AGPAT4\n", " 1-Cys PRDX6\n", " 1/2-SBSRNA4 SEC24B-AS1\n", " 10-FTHFDH ALDH1L1\n", " 10-fTHF ALDH1L1\n", " 101F10.1 KNOP1\n", " 101F6 CYB561D2\n", " 104p TUBGCP3\n", " 105A SNORA73B\n", " 105B RNU105B\n", " 10C ARHGAP9\n", " 10q23del BMPR1A\n", " 11-DH HSD11B1\n", "... ...\n", "10116 tpcr07 Olr1398\n", " tpcr09 Olr737\n", " tpcr10 Olr1404\n", " tpcr13 Olr1366\n", " tpcr18 Olr1307\n", " tpcr19 Olr1226\n", " tpcr21 Olr1283\n", " tpcr38 Olr1606\n", " trk-B Ntrk2\n", " trkB Ntrk2\n", " trkC Ntrk3\n", " try1 Prss58\n", " uKATP-1 Kcnj8\n", " uPAR Plaur\n", " uPAR-2 Plaur\n", " uPAR-3 Plaur\n", " ufc1-s Ufc1\n", " ufd2a Ube4b\n", " upf0227 Abhd17a\n", " vip/phi27 Vip\n", " vms-tm2 Cd99l2\n", " wbp-11 Wbp11\n", " x85 Dus1l\n", " xylt-II Xylt2\n", " y+LAT1 Slc7a7\n", " zbs559 Map1lc3b\n", " zgc:101121 Zfand6\n", " zgc:66482 Pik3ip1\n", " zif-268 Egr1\n", " NaN Scn11a\n", "\n", "[329092 rows x 1 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mappingDFHMR_updated" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "pvenv", "language": "python", "name": "pvenv" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }