# %% [markdown]
# # New Mapping File from NCBI Gene Dataset
# Downloaded: Jan 2, 2019

# %%
import csv
import pandas as pd

# %%
file = "/Users/maayan/sigsets/Harmonizome/Data/All_Data.gene_info_010219"
mapping_file = "/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv"
gene_sym_ids = "/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv"

# %%
# Human, Mouse, and Rat only
HMR_TAX_IDS = ("9606", "10090", "10116")

# One name per column actually written to the gene-ID file
# (taxon ID, symbol, gene ID).
GENE_ID_HEADER = (
    "Taxon ID",
    "Human, Mouse, and Rat Approved Symbol",
    "Entrez Gene ID(supplied by NCBI)",
)


def write_hmr_mapping_files(gene_info_path, gene_ids_path, mapping_path,
                            tax_ids=HMR_TAX_IDS):
    """Write the gene-ID table and the synonym mapping table from a gene_info dump.

    The input is read as a tab-separated file.  Column 0 is used as the taxon
    ID, column 1 as the gene ID, column 2 as the symbol and column 4 as the
    ``|``-separated synonym list (``-`` meaning "no synonyms").

    Skipped rows:
      * rows with fewer than five fields (blank or truncated lines),
      * rows whose taxon ID is not in ``tax_ids`` (this also drops the
        ``#``-prefixed header line, if any),
      * rows whose symbol is ``NEWENTRY``.

    Output files (both tab-separated, unquoted, one row per line):
      * ``gene_ids_path``: a header line (``GENE_ID_HEADER``) followed by
        ``taxon ID, symbol, gene ID`` for every kept gene.
      * ``mapping_path``: no header; ``taxon ID, name, symbol`` where ``name``
        is the symbol itself or one of its synonyms.  Within a taxon each name
        is written only once -- the first gene that claims it wins.

    Parameters
    ----------
    gene_info_path : str
        Path of the gene_info dump to read.
    gene_ids_path : str
        Path of the gene-ID table to (over)write.
    mapping_path : str
        Path of the synonym mapping table to (over)write.
    tax_ids : iterable of str
        Taxon IDs to keep; compared as strings against column 0.

    Returns
    -------
    dict
        Maps each kept taxon ID to the set of names written for it.
    """
    wanted = frozenset(tax_ids)
    names_seen = {}
    with open(gene_info_path, newline="", encoding="utf-8") as src, \
            open(gene_ids_path, "w", encoding="utf-8") as ids_out, \
            open(mapping_path, "w", encoding="utf-8") as map_out:
        # The dump is plain TSV with no quoting convention; QUOTE_NONE keeps a
        # field that starts with '"' from swallowing the following rows.
        reader = csv.reader(src, delimiter="\t", quoting=csv.QUOTE_NONE)
        # The header needs its own line terminator, otherwise the first gene
        # row is glued onto it and lost.
        ids_out.write("\t".join(GENE_ID_HEADER) + "\n")
        for row in reader:
            if len(row) < 5:
                continue
            tax_id, gene_id, sym, synonym_field = row[0], row[1], row[2], row[4]
            if tax_id not in wanted or sym == "NEWENTRY":
                continue
            # A gene without synonyms is still a gene: it keeps its row in the
            # gene-ID table and maps its own symbol to itself.
            names = [sym]
            if synonym_field != "-":
                names.extend(synonym_field.split("|"))
            ids_out.write("\t".join([tax_id, sym, gene_id]) + "\n")
            seen_for_taxon = names_seen.setdefault(tax_id, set())
            for name in names:
                # NOTE(review): first come, first served -- a name already
                # claimed by an earlier gene of the same taxon is not remapped,
                # even if it is a later gene's own symbol. Confirm this is the
                # intended tie-break.
                if name not in seen_for_taxon:
                    seen_for_taxon.add(name)
                    map_out.write("\t".join([tax_id, name, sym]) + "\n")
    return names_seen


def load_gene_ids(path):
    """Load the gene-ID table, indexed by (taxon ID, symbol) and sorted.

    Symbols are read as literal strings: default NA parsing is disabled so a
    symbol spelled ``NA`` or ``NaN`` stays a usable index label, and quote
    characters are not interpreted because the file is written unquoted.
    """
    frame = pd.read_csv(
        path,
        sep="\t",
        dtype={GENE_ID_HEADER[1]: str},
        keep_default_na=False,
        quoting=csv.QUOTE_NONE,
    )
    return frame.set_index(list(GENE_ID_HEADER[:2])).sort_index()


# %%
# Jupyter executes cells with __name__ == "__main__", so these cells run in the
# notebook but not when the helpers above are imported from elsewhere.
if __name__ == "__main__":
    mapping_dict = write_hmr_mapping_files(file, gene_sym_ids, mapping_file)

# %%
if __name__ == "__main__":
    getGeneIDsHMR_updated = load_gene_ids(gene_sym_ids)
# %%
# Sanity check: a known (taxon ID, symbol) pair is present in the gene-ID index.
(9606, "A2MP1") in getGeneIDsHMR_updated.index

# %%
# Load the header-less synonym mapping file written earlier in the notebook:
# column 0 = taxon ID, column 1 = name (symbol or synonym), column 2 = symbol.
# Names are kept as literal strings: default NA parsing is switched off so an
# alias spelled "NaN" or "NA" stays a real index label instead of becoming a
# missing value, and quotes are not interpreted because the file is written
# unquoted.
mappingDFHMR_updated = (
    pd.read_csv(
        '/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv',
        sep='\t',
        header=None,
        dtype={1: str, 2: str},
        keep_default_na=False,
        quoting=csv.QUOTE_NONE,
    )
    .set_index([0, 1])
    .sort_index()
)

# %%
# Sanity check: a known synonym is present in the (taxon ID, name) index.
(9606, "(FM-3)") in mappingDFHMR_updated.index
| \n", " | \n", " | 2 | \n", "
|---|---|---|
| 0 | \n", "1 | \n", "\n", " |
| 9606 | \n", "(FM-3) | \n", "NMUR1 | \n", "
| (IV)-44 | \n", "IGHVIV-44-1 | \n", "|
| (ppGpp)ase | \n", "HDDC3 | \n", "|
| 0610011B16Rik | \n", "CORO7 | \n", "|
| 0610037N12Rik | \n", "POP7 | \n", "|
| 0710008D09Rik | \n", "UQCR11 | \n", "|
| 0808y08y | \n", "NFYC-AS1 | \n", "|
| 1-12P | \n", "IGHV1-12 | \n", "|
| 1-14P | \n", "IGHV1-14 | \n", "|
| 1-17P | \n", "IGHV1-17 | \n", "|
| 1-67P | \n", "IGHV1-67 | \n", "|
| 1-68P | \n", "IGHV1-68 | \n", "|
| 1-8D | \n", "IFITM2 | \n", "|
| 1-8U | \n", "IFITM3 | \n", "|
| 1-AGPAT 6 | \n", "GPAT4 | \n", "|
| 1-AGPAT1 | \n", "AGPAT1 | \n", "|
| 1-AGPAT2 | \n", "AGPAT2 | \n", "|
| 1-AGPAT4 | \n", "AGPAT4 | \n", "|
| 1-Cys | \n", "PRDX6 | \n", "|
| 1/2-SBSRNA4 | \n", "SEC24B-AS1 | \n", "|
| 10-FTHFDH | \n", "ALDH1L1 | \n", "|
| 10-fTHF | \n", "ALDH1L1 | \n", "|
| 101F10.1 | \n", "KNOP1 | \n", "|
| 101F6 | \n", "CYB561D2 | \n", "|
| 104p | \n", "TUBGCP3 | \n", "|
| 105A | \n", "SNORA73B | \n", "|
| 105B | \n", "RNU105B | \n", "|
| 10C | \n", "ARHGAP9 | \n", "|
| 10q23del | \n", "BMPR1A | \n", "|
| 11-DH | \n", "HSD11B1 | \n", "|
| ... | \n", "... | \n", "... | \n", "
| 10116 | \n", "tpcr07 | \n", "Olr1398 | \n", "
| tpcr09 | \n", "Olr737 | \n", "|
| tpcr10 | \n", "Olr1404 | \n", "|
| tpcr13 | \n", "Olr1366 | \n", "|
| tpcr18 | \n", "Olr1307 | \n", "|
| tpcr19 | \n", "Olr1226 | \n", "|
| tpcr21 | \n", "Olr1283 | \n", "|
| tpcr38 | \n", "Olr1606 | \n", "|
| trk-B | \n", "Ntrk2 | \n", "|
| trkB | \n", "Ntrk2 | \n", "|
| trkC | \n", "Ntrk3 | \n", "|
| try1 | \n", "Prss58 | \n", "|
| uKATP-1 | \n", "Kcnj8 | \n", "|
| uPAR | \n", "Plaur | \n", "|
| uPAR-2 | \n", "Plaur | \n", "|
| uPAR-3 | \n", "Plaur | \n", "|
| ufc1-s | \n", "Ufc1 | \n", "|
| ufd2a | \n", "Ube4b | \n", "|
| upf0227 | \n", "Abhd17a | \n", "|
| vip/phi27 | \n", "Vip | \n", "|
| vms-tm2 | \n", "Cd99l2 | \n", "|
| wbp-11 | \n", "Wbp11 | \n", "|
| x85 | \n", "Dus1l | \n", "|
| xylt-II | \n", "Xylt2 | \n", "|
| y+LAT1 | \n", "Slc7a7 | \n", "|
| zbs559 | \n", "Map1lc3b | \n", "|
| zgc:101121 | \n", "Zfand6 | \n", "|
| zgc:66482 | \n", "Pik3ip1 | \n", "|
| zif-268 | \n", "Egr1 | \n", "|
| NaN | \n", "Scn11a | \n", "
329092 rows × 1 columns
\n", "