{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# New Mapping File from NCBI Gene Dataset\n", "Downloaded: Jan 2, 2019" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import csv\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "file = \"/Users/maayan/sigsets/Harmonizome/Data/All_Data.gene_info_010219\"\n", "mapping_file = \"/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv\"\n", "gene_sym_ids = \"/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Human, Mouse, and Rat only\n", "mapping_dict = {}\n", "with open(file) as o:\n", " with open(gene_sym_ids, \"w\") as w:\n", " with open(mapping_file, \"w\") as m:\n", " csv_file = csv.reader(o, delimiter=\"\\t\")\n", " w.write(\"Human, Mouse, and Rat Approved Symbol\tEntrez Gene ID(supplied by NCBI)\")\n", " for row in csv_file:\n", " tax_id = row[0]\n", " if tax_id in [\"9606\", \"10090\", \"10116\"]:\n", " gene_id = row[1]\n", " if not row[2] == \"NEWENTRY\":\n", " sym = row[2]\n", " else: \n", " continue\n", " synonyms = [sym]\n", " if not row[4] == \"-\":\n", " synonyms = synonyms + row[4].split(\"|\")\n", " else:\n", " continue\n", " w.write(\"\\t\".join([tax_id, sym, gene_id]) + \"\\n\") # Add Taxon ID\n", " if not tax_id in mapping_dict:\n", " mapping_dict[tax_id] = set([])\n", " for syn in synonyms:\n", " if syn not in mapping_dict[tax_id]:\n", " mapping_dict[tax_id].add(syn)\n", " m.write(\"\\t\".join([tax_id, syn, sym])+ \"\\n\")\n", " " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "getGeneIDsHMR_updated = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv', sep='\\t', index_col=[0,1]).sort_index()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(9606,\"A2MP1\") in getGeneIDsHMR_updated.index" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "mappingDFHMR_updated = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv', sep='\\t', header=None, index_col=[0,1]).sort_index()\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(9606,\"(FM-3)\") in mappingDFHMR_updated.index" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | \n", " | 2 | \n", "
---|---|---|
0 | \n", "1 | \n", "\n", " |
9606 | \n", "(FM-3) | \n", "NMUR1 | \n", "
(IV)-44 | \n", "IGHVIV-44-1 | \n", "|
(ppGpp)ase | \n", "HDDC3 | \n", "|
0610011B16Rik | \n", "CORO7 | \n", "|
0610037N12Rik | \n", "POP7 | \n", "|
0710008D09Rik | \n", "UQCR11 | \n", "|
0808y08y | \n", "NFYC-AS1 | \n", "|
1-12P | \n", "IGHV1-12 | \n", "|
1-14P | \n", "IGHV1-14 | \n", "|
1-17P | \n", "IGHV1-17 | \n", "|
1-67P | \n", "IGHV1-67 | \n", "|
1-68P | \n", "IGHV1-68 | \n", "|
1-8D | \n", "IFITM2 | \n", "|
1-8U | \n", "IFITM3 | \n", "|
1-AGPAT 6 | \n", "GPAT4 | \n", "|
1-AGPAT1 | \n", "AGPAT1 | \n", "|
1-AGPAT2 | \n", "AGPAT2 | \n", "|
1-AGPAT4 | \n", "AGPAT4 | \n", "|
1-Cys | \n", "PRDX6 | \n", "|
1/2-SBSRNA4 | \n", "SEC24B-AS1 | \n", "|
10-FTHFDH | \n", "ALDH1L1 | \n", "|
10-fTHF | \n", "ALDH1L1 | \n", "|
101F10.1 | \n", "KNOP1 | \n", "|
101F6 | \n", "CYB561D2 | \n", "|
104p | \n", "TUBGCP3 | \n", "|
105A | \n", "SNORA73B | \n", "|
105B | \n", "RNU105B | \n", "|
10C | \n", "ARHGAP9 | \n", "|
10q23del | \n", "BMPR1A | \n", "|
11-DH | \n", "HSD11B1 | \n", "|
... | \n", "... | \n", "... | \n", "
10116 | \n", "tpcr07 | \n", "Olr1398 | \n", "
tpcr09 | \n", "Olr737 | \n", "|
tpcr10 | \n", "Olr1404 | \n", "|
tpcr13 | \n", "Olr1366 | \n", "|
tpcr18 | \n", "Olr1307 | \n", "|
tpcr19 | \n", "Olr1226 | \n", "|
tpcr21 | \n", "Olr1283 | \n", "|
tpcr38 | \n", "Olr1606 | \n", "|
trk-B | \n", "Ntrk2 | \n", "|
trkB | \n", "Ntrk2 | \n", "|
trkC | \n", "Ntrk3 | \n", "|
try1 | \n", "Prss58 | \n", "|
uKATP-1 | \n", "Kcnj8 | \n", "|
uPAR | \n", "Plaur | \n", "|
uPAR-2 | \n", "Plaur | \n", "|
uPAR-3 | \n", "Plaur | \n", "|
ufc1-s | \n", "Ufc1 | \n", "|
ufd2a | \n", "Ube4b | \n", "|
upf0227 | \n", "Abhd17a | \n", "|
vip/phi27 | \n", "Vip | \n", "|
vms-tm2 | \n", "Cd99l2 | \n", "|
wbp-11 | \n", "Wbp11 | \n", "|
x85 | \n", "Dus1l | \n", "|
xylt-II | \n", "Xylt2 | \n", "|
y+LAT1 | \n", "Slc7a7 | \n", "|
zbs559 | \n", "Map1lc3b | \n", "|
zgc:101121 | \n", "Zfand6 | \n", "|
zgc:66482 | \n", "Pik3ip1 | \n", "|
zif-268 | \n", "Egr1 | \n", "|
NaN | \n", "Scn11a | \n", "
329092 rows × 1 columns
\n", "