{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "d=pd.read_csv('histone_genes.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Histone typeHistone variantHGNC symbolNCBI gene IDEnsembl gene IDExpr. timingExpr. patternBiotypeBona fide canonicalPMIDs
0H1H1.0H1-03005ENSG00000189060RINaNCODNaN26689747
1H1H1.1H1-13024ENSG00000124610RDNaNCODNaN26689747
2H1H1.2H1-23006ENSG00000187837MixedNaNCODNaN26689747
3H1H1.3H1-33007ENSG00000124575RDNaNCODNaN26689747
4H1H1.4H1-43008ENSG00000168298RDNaNCODNaN26689747
5H1H1.5H1-53009ENSG00000184357RDNaNCODNaN26689747
6H1TS H1.6H1-63010ENSG00000187475RDTSCODNaN26689747
7H1TS H1.7H1-7341567ENSG00000187166RITSCODNaN26689747
8H1OO H1.8H1-8132243ENSG00000178804RIOOCODNaN26689747
9H1TS H1.9(?)H1-9P373861ENSG00000188662RITSCODNaN12920187 26689747 17852044
10H1H1.10H1-108971ENSG00000184897RINaNCODNaN26689747
11H1NaNH1-12P387325ENSG00000216331NaNNaNPSNaNNaN
12H2ATS H2A.1H2AC1221613ENSG00000164508MixedTSCODNaN2011515 7068607 24506885
13H2ANaNH2AC2P387319ENSG00000216436NaNNaNPScanonical12408966 25731851
14H2ANaNH2AC3P85303ENSG00000242387NaNNaNPSNaNNaN
15H2Acanonical H2AH2AC48335ENSG00000278463RDNaNCODcanonical12408966 25731851
16H2ANaNH2AC5P10341ENSG00000234816NaNNaNPSNaNNaN
17H2Acanonical H2AH2AC68334ENSG00000180573RDNaNCODcanonical12408966 25731851
18H2Acanonical H2AH2AC73013ENSG00000196866RDNaNCODcanonical12408966 25731851
19H2Acanonical H2AH2AC83012ENSG00000277075RDNaNCODcanonical12408966 25731851
20H2ANaNH2AC9P387323ENSG00000218281NaNNaNPSNaNNaN
21H2ANaNH2AC10P8333ENSG00000218690NaNNaNPSNaNNaN
22H2Acanonical H2AH2AC118969ENSG00000196787RDNaNCODcanonical12408966 25731851
23H2Acanonical H2AH2AC1285235ENSG00000274997RDNaNCODcanonical12408966 25731851
24H2Acanonical H2AH2AC138329ENSG00000196747RDNaNCODcanonical12408966 25731851
25H2Acanonical H2AH2AC148331ENSG00000276368RDNaNCODcanonical12408966 25731851
26H2Acanonical H2AH2AC158330ENSG00000275221RDNaNCODcanonical12408966 25731851
27H2Acanonical H2AH2AC168332ENSG00000276903RDNaNCODcanonical12408966 25731851
28H2Acanonical H2AH2AC178336ENSG00000278677RDNaNCODcanonical12408966 25731851
29H2Acanonical H2AH2AC188337ENSG00000203812RDNaNCODcanonical12408966 25731851
.................................
103H3canonical H3.2H3C14126961ENSG00000203811RDNaNCODcanonical12408966
104H3canonical H3.1H3C15333932ENSG00000203852RDNaNCODcanonical12408966
105H3H3.Y.1H3Y1391769ENSG00000269466RINaNCODNaN20819935
106H3H3.Y.2H3Y2340096ENSG00000268799RINaNCODNaN20819935
107H3canonical H3(?)H3-2440686ENSG00000273213RDNaNCODNaN12408966
108H3H3.3H3-3A3020ENSG00000163041RINaNCODNaN19412883
109H3H3.3H3-3B3021ENSG00000132475RINaNCODNaN19412883
110H3TS H3.4H3-48290ENSG00000168148RITSCODNaN8986613
111H3H3.5H3-5440093ENSG00000188375RITSCODNaN21274551
112H3cenH3CENPA1058ENSG00000115163RINaNCODNaN23324462
113H3NaNH3P2610338ENSG00000224447NaNNaNPSNaNNaN
114H3NaNH3P4106479023ENSG00000213244NaNNaNPSNaNNaN
115H3NaNH3P37664611ENSG00000270433NaNNaNPSNaNNaN
116H3NaNH3P38654505ENSG00000259389NaNNaNPSNaNNaN
117H4canonical H4H4C18359ENSG00000278637RDNaNCODcanonical12408966
118H4canonical H4H4C28366ENSG00000278705RDNaNCODcanonical12408966
119H4canonical H4H4C38364ENSG00000197061RDNaNCODcanonical12408966
120H4canonical H4H4C48360ENSG00000277157RDNaNCODcanonical12408966
121H4canonical H4H4C58367ENSG00000276966RDNaNCODcanonical12408966
122H4canonical H4H4C68361ENSG00000274618RDNaNCODcanonical12408966
123H4canonical H4H4C78369ENSG00000275663RDNaNCODcanonical12408966
124H4canonical H4H4C88365ENSG00000158406RDNaNCODcanonical12408966
125H4canonical H4H4C98294ENSG00000276180RDNaNCODcanonical12408966
126H4NaNH4C10P10337ENSG00000217862NaNNaNPSNaNNaN
127H4canonical H4H4C118363ENSG00000197238RDNaNCODcanonical12408966
128H4canonical H4H4C128362ENSG00000273542RDNaNCODcanonical12408966
129H4canonical H4H4C138368ENSG00000275126RDNaNCODcanonical12408966
130H4canonical H4H4C148370ENSG00000270882RDNaNCODcanonical12408966
131H4canonical H4H4C15554313ENSG00000270276RDNaNCODcanonical12408966
132H4canonical H4H4-16121504ENSG00000197837RDNaNCODcanonical12408966
\n", "

133 rows × 10 columns

\n", "
" ], "text/plain": [ " Histone type Histone variant HGNC symbol NCBI gene ID Ensembl gene ID \\\n", "0 H1 H1.0 H1-0 3005 ENSG00000189060 \n", "1 H1 H1.1 H1-1 3024 ENSG00000124610 \n", "2 H1 H1.2 H1-2 3006 ENSG00000187837 \n", "3 H1 H1.3 H1-3 3007 ENSG00000124575 \n", "4 H1 H1.4 H1-4 3008 ENSG00000168298 \n", "5 H1 H1.5 H1-5 3009 ENSG00000184357 \n", "6 H1 TS H1.6 H1-6 3010 ENSG00000187475 \n", "7 H1 TS H1.7 H1-7 341567 ENSG00000187166 \n", "8 H1 OO H1.8 H1-8 132243 ENSG00000178804 \n", "9 H1 TS H1.9(?) H1-9P 373861 ENSG00000188662 \n", "10 H1 H1.10 H1-10 8971 ENSG00000184897 \n", "11 H1 NaN H1-12P 387325 ENSG00000216331 \n", "12 H2A TS H2A.1 H2AC1 221613 ENSG00000164508 \n", "13 H2A NaN H2AC2P 387319 ENSG00000216436 \n", "14 H2A NaN H2AC3P 85303 ENSG00000242387 \n", "15 H2A canonical H2A H2AC4 8335 ENSG00000278463 \n", "16 H2A NaN H2AC5P 10341 ENSG00000234816 \n", "17 H2A canonical H2A H2AC6 8334 ENSG00000180573 \n", "18 H2A canonical H2A H2AC7 3013 ENSG00000196866 \n", "19 H2A canonical H2A H2AC8 3012 ENSG00000277075 \n", "20 H2A NaN H2AC9P 387323 ENSG00000218281 \n", "21 H2A NaN H2AC10P 8333 ENSG00000218690 \n", "22 H2A canonical H2A H2AC11 8969 ENSG00000196787 \n", "23 H2A canonical H2A H2AC12 85235 ENSG00000274997 \n", "24 H2A canonical H2A H2AC13 8329 ENSG00000196747 \n", "25 H2A canonical H2A H2AC14 8331 ENSG00000276368 \n", "26 H2A canonical H2A H2AC15 8330 ENSG00000275221 \n", "27 H2A canonical H2A H2AC16 8332 ENSG00000276903 \n", "28 H2A canonical H2A H2AC17 8336 ENSG00000278677 \n", "29 H2A canonical H2A H2AC18 8337 ENSG00000203812 \n", ".. ... ... ... ... ... \n", "103 H3 canonical H3.2 H3C14 126961 ENSG00000203811 \n", "104 H3 canonical H3.1 H3C15 333932 ENSG00000203852 \n", "105 H3 H3.Y.1 H3Y1 391769 ENSG00000269466 \n", "106 H3 H3.Y.2 H3Y2 340096 ENSG00000268799 \n", "107 H3 canonical H3(?) 
H3-2 440686 ENSG00000273213 \n", "108 H3 H3.3 H3-3A 3020 ENSG00000163041 \n", "109 H3 H3.3 H3-3B 3021 ENSG00000132475 \n", "110 H3 TS H3.4 H3-4 8290 ENSG00000168148 \n", "111 H3 H3.5 H3-5 440093 ENSG00000188375 \n", "112 H3 cenH3 CENPA 1058 ENSG00000115163 \n", "113 H3 NaN H3P26 10338 ENSG00000224447 \n", "114 H3 NaN H3P4 106479023 ENSG00000213244 \n", "115 H3 NaN H3P37 664611 ENSG00000270433 \n", "116 H3 NaN H3P38 654505 ENSG00000259389 \n", "117 H4 canonical H4 H4C1 8359 ENSG00000278637 \n", "118 H4 canonical H4 H4C2 8366 ENSG00000278705 \n", "119 H4 canonical H4 H4C3 8364 ENSG00000197061 \n", "120 H4 canonical H4 H4C4 8360 ENSG00000277157 \n", "121 H4 canonical H4 H4C5 8367 ENSG00000276966 \n", "122 H4 canonical H4 H4C6 8361 ENSG00000274618 \n", "123 H4 canonical H4 H4C7 8369 ENSG00000275663 \n", "124 H4 canonical H4 H4C8 8365 ENSG00000158406 \n", "125 H4 canonical H4 H4C9 8294 ENSG00000276180 \n", "126 H4 NaN H4C10P 10337 ENSG00000217862 \n", "127 H4 canonical H4 H4C11 8363 ENSG00000197238 \n", "128 H4 canonical H4 H4C12 8362 ENSG00000273542 \n", "129 H4 canonical H4 H4C13 8368 ENSG00000275126 \n", "130 H4 canonical H4 H4C14 8370 ENSG00000270882 \n", "131 H4 canonical H4 H4C15 554313 ENSG00000270276 \n", "132 H4 canonical H4 H4-16 121504 ENSG00000197837 \n", "\n", " Expr. timing Expr. 
pattern Biotype Bona fide canonical \\\n", "0 RI NaN COD NaN \n", "1 RD NaN COD NaN \n", "2 Mixed NaN COD NaN \n", "3 RD NaN COD NaN \n", "4 RD NaN COD NaN \n", "5 RD NaN COD NaN \n", "6 RD TS COD NaN \n", "7 RI TS COD NaN \n", "8 RI OO COD NaN \n", "9 RI TS COD NaN \n", "10 RI NaN COD NaN \n", "11 NaN NaN PS NaN \n", "12 Mixed TS COD NaN \n", "13 NaN NaN PS canonical \n", "14 NaN NaN PS NaN \n", "15 RD NaN COD canonical \n", "16 NaN NaN PS NaN \n", "17 RD NaN COD canonical \n", "18 RD NaN COD canonical \n", "19 RD NaN COD canonical \n", "20 NaN NaN PS NaN \n", "21 NaN NaN PS NaN \n", "22 RD NaN COD canonical \n", "23 RD NaN COD canonical \n", "24 RD NaN COD canonical \n", "25 RD NaN COD canonical \n", "26 RD NaN COD canonical \n", "27 RD NaN COD canonical \n", "28 RD NaN COD canonical \n", "29 RD NaN COD canonical \n", ".. ... ... ... ... \n", "103 RD NaN COD canonical \n", "104 RD NaN COD canonical \n", "105 RI NaN COD NaN \n", "106 RI NaN COD NaN \n", "107 RD NaN COD NaN \n", "108 RI NaN COD NaN \n", "109 RI NaN COD NaN \n", "110 RI TS COD NaN \n", "111 RI TS COD NaN \n", "112 RI NaN COD NaN \n", "113 NaN NaN PS NaN \n", "114 NaN NaN PS NaN \n", "115 NaN NaN PS NaN \n", "116 NaN NaN PS NaN \n", "117 RD NaN COD canonical \n", "118 RD NaN COD canonical \n", "119 RD NaN COD canonical \n", "120 RD NaN COD canonical \n", "121 RD NaN COD canonical \n", "122 RD NaN COD canonical \n", "123 RD NaN COD canonical \n", "124 RD NaN COD canonical \n", "125 RD NaN COD canonical \n", "126 NaN NaN PS NaN \n", "127 RD NaN COD canonical \n", "128 RD NaN COD canonical \n", "129 RD NaN COD canonical \n", "130 RD NaN COD canonical \n", "131 RD NaN COD canonical \n", "132 RD NaN COD canonical \n", "\n", " PMIDs \n", "0 26689747 \n", "1 26689747 \n", "2 26689747 \n", "3 26689747 \n", "4 26689747 \n", "5 26689747 \n", "6 26689747 \n", "7 26689747 \n", "8 26689747 \n", "9 12920187 26689747 17852044 \n", "10 26689747 \n", "11 NaN \n", "12 2011515 7068607 24506885 \n", "13 12408966 
25731851 \n", "14 NaN \n", "15 12408966 25731851 \n", "16 NaN \n", "17 12408966 25731851 \n", "18 12408966 25731851 \n", "19 12408966 25731851 \n", "20 NaN \n", "21 NaN \n", "22 12408966 25731851 \n", "23 12408966 25731851 \n", "24 12408966 25731851 \n", "25 12408966 25731851 \n", "26 12408966 25731851 \n", "27 12408966 25731851 \n", "28 12408966 25731851 \n", "29 12408966 25731851 \n", ".. ... \n", "103 12408966 \n", "104 12408966 \n", "105 20819935 \n", "106 20819935 \n", "107 12408966 \n", "108 19412883 \n", "109 19412883 \n", "110 8986613 \n", "111 21274551 \n", "112 23324462 \n", "113 NaN \n", "114 NaN \n", "115 NaN \n", "116 NaN \n", "117 12408966 \n", "118 12408966 \n", "119 12408966 \n", "120 12408966 \n", "121 12408966 \n", "122 12408966 \n", "123 12408966 \n", "124 12408966 \n", "125 12408966 \n", "126 NaN \n", "127 12408966 \n", "128 12408966 \n", "129 12408966 \n", "130 12408966 \n", "131 12408966 \n", "132 12408966 \n", "\n", "[133 rows x 10 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "hist_genes=(d[d['Biotype'].isin(['COD'])])[['Histone type','Histone variant','HGNC symbol','NCBI gene ID','Ensembl gene ID']]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Histone typeHistone variantHGNC symbolNCBI gene IDEnsembl gene ID
0H1H1.0H1-03005ENSG00000189060
1H1H1.1H1-13024ENSG00000124610
2H1H1.2H1-23006ENSG00000187837
3H1H1.3H1-33007ENSG00000124575
4H1H1.4H1-43008ENSG00000168298
5H1H1.5H1-53009ENSG00000184357
6H1TS H1.6H1-63010ENSG00000187475
7H1TS H1.7H1-7341567ENSG00000187166
8H1OO H1.8H1-8132243ENSG00000178804
9H1TS H1.9(?)H1-9P373861ENSG00000188662
10H1H1.10H1-108971ENSG00000184897
12H2ATS H2A.1H2AC1221613ENSG00000164508
15H2Acanonical H2AH2AC48335ENSG00000278463
17H2Acanonical H2AH2AC68334ENSG00000180573
18H2Acanonical H2AH2AC73013ENSG00000196866
19H2Acanonical H2AH2AC83012ENSG00000277075
22H2Acanonical H2AH2AC118969ENSG00000196787
23H2Acanonical H2AH2AC1285235ENSG00000274997
24H2Acanonical H2AH2AC138329ENSG00000196747
25H2Acanonical H2AH2AC148331ENSG00000276368
26H2Acanonical H2AH2AC158330ENSG00000275221
27H2Acanonical H2AH2AC168332ENSG00000276903
28H2Acanonical H2AH2AC178336ENSG00000278677
29H2Acanonical H2AH2AC188337ENSG00000203812
30H2Acanonical H2AH2AC19723790ENSG00000272196
31H2Acanonical H2AH2AC208338ENSG00000184260
32H2Acanonical H2AH2AC21317772ENSG00000184270
33H2AH2A.J(?)H2AJ55766ENSG00000246705
34H2Acanonical H2AH2AW92815ENSG00000181218
35H2AH2A.XH2AX3014ENSG00000188486
..................
97H3canonical H3.1H3C88355ENSG00000273983
99H3canonical H3.1H3C108357ENSG00000278828
100H3canonical H3.1H3C118354ENSG00000275379
101H3canonical H3.1H3C128356ENSG00000197153
102H3canonical H3.1H3C13653604ENSG00000183598
103H3canonical H3.2H3C14126961ENSG00000203811
104H3canonical H3.1H3C15333932ENSG00000203852
105H3H3.Y.1H3Y1391769ENSG00000269466
106H3H3.Y.2H3Y2340096ENSG00000268799
107H3canonical H3(?)H3-2440686ENSG00000273213
108H3H3.3H3-3A3020ENSG00000163041
109H3H3.3H3-3B3021ENSG00000132475
110H3TS H3.4H3-48290ENSG00000168148
111H3H3.5H3-5440093ENSG00000188375
112H3cenH3CENPA1058ENSG00000115163
117H4canonical H4H4C18359ENSG00000278637
118H4canonical H4H4C28366ENSG00000278705
119H4canonical H4H4C38364ENSG00000197061
120H4canonical H4H4C48360ENSG00000277157
121H4canonical H4H4C58367ENSG00000276966
122H4canonical H4H4C68361ENSG00000274618
123H4canonical H4H4C78369ENSG00000275663
124H4canonical H4H4C88365ENSG00000158406
125H4canonical H4H4C98294ENSG00000276180
127H4canonical H4H4C118363ENSG00000197238
128H4canonical H4H4C128362ENSG00000273542
129H4canonical H4H4C138368ENSG00000275126
130H4canonical H4H4C148370ENSG00000270882
131H4canonical H4H4C15554313ENSG00000270276
132H4canonical H4H4-16121504ENSG00000197837
\n", "

96 rows × 5 columns

\n", "
" ], "text/plain": [ " Histone type Histone variant HGNC symbol NCBI gene ID Ensembl gene ID\n", "0 H1 H1.0 H1-0 3005 ENSG00000189060\n", "1 H1 H1.1 H1-1 3024 ENSG00000124610\n", "2 H1 H1.2 H1-2 3006 ENSG00000187837\n", "3 H1 H1.3 H1-3 3007 ENSG00000124575\n", "4 H1 H1.4 H1-4 3008 ENSG00000168298\n", "5 H1 H1.5 H1-5 3009 ENSG00000184357\n", "6 H1 TS H1.6 H1-6 3010 ENSG00000187475\n", "7 H1 TS H1.7 H1-7 341567 ENSG00000187166\n", "8 H1 OO H1.8 H1-8 132243 ENSG00000178804\n", "9 H1 TS H1.9(?) H1-9P 373861 ENSG00000188662\n", "10 H1 H1.10 H1-10 8971 ENSG00000184897\n", "12 H2A TS H2A.1 H2AC1 221613 ENSG00000164508\n", "15 H2A canonical H2A H2AC4 8335 ENSG00000278463\n", "17 H2A canonical H2A H2AC6 8334 ENSG00000180573\n", "18 H2A canonical H2A H2AC7 3013 ENSG00000196866\n", "19 H2A canonical H2A H2AC8 3012 ENSG00000277075\n", "22 H2A canonical H2A H2AC11 8969 ENSG00000196787\n", "23 H2A canonical H2A H2AC12 85235 ENSG00000274997\n", "24 H2A canonical H2A H2AC13 8329 ENSG00000196747\n", "25 H2A canonical H2A H2AC14 8331 ENSG00000276368\n", "26 H2A canonical H2A H2AC15 8330 ENSG00000275221\n", "27 H2A canonical H2A H2AC16 8332 ENSG00000276903\n", "28 H2A canonical H2A H2AC17 8336 ENSG00000278677\n", "29 H2A canonical H2A H2AC18 8337 ENSG00000203812\n", "30 H2A canonical H2A H2AC19 723790 ENSG00000272196\n", "31 H2A canonical H2A H2AC20 8338 ENSG00000184260\n", "32 H2A canonical H2A H2AC21 317772 ENSG00000184270\n", "33 H2A H2A.J(?) H2AJ 55766 ENSG00000246705\n", "34 H2A canonical H2A H2AW 92815 ENSG00000181218\n", "35 H2A H2A.X H2AX 3014 ENSG00000188486\n", ".. ... ... ... ... 
...\n", "97 H3 canonical H3.1 H3C8 8355 ENSG00000273983\n", "99 H3 canonical H3.1 H3C10 8357 ENSG00000278828\n", "100 H3 canonical H3.1 H3C11 8354 ENSG00000275379\n", "101 H3 canonical H3.1 H3C12 8356 ENSG00000197153\n", "102 H3 canonical H3.1 H3C13 653604 ENSG00000183598\n", "103 H3 canonical H3.2 H3C14 126961 ENSG00000203811\n", "104 H3 canonical H3.1 H3C15 333932 ENSG00000203852\n", "105 H3 H3.Y.1 H3Y1 391769 ENSG00000269466\n", "106 H3 H3.Y.2 H3Y2 340096 ENSG00000268799\n", "107 H3 canonical H3(?) H3-2 440686 ENSG00000273213\n", "108 H3 H3.3 H3-3A 3020 ENSG00000163041\n", "109 H3 H3.3 H3-3B 3021 ENSG00000132475\n", "110 H3 TS H3.4 H3-4 8290 ENSG00000168148\n", "111 H3 H3.5 H3-5 440093 ENSG00000188375\n", "112 H3 cenH3 CENPA 1058 ENSG00000115163\n", "117 H4 canonical H4 H4C1 8359 ENSG00000278637\n", "118 H4 canonical H4 H4C2 8366 ENSG00000278705\n", "119 H4 canonical H4 H4C3 8364 ENSG00000197061\n", "120 H4 canonical H4 H4C4 8360 ENSG00000277157\n", "121 H4 canonical H4 H4C5 8367 ENSG00000276966\n", "122 H4 canonical H4 H4C6 8361 ENSG00000274618\n", "123 H4 canonical H4 H4C7 8369 ENSG00000275663\n", "124 H4 canonical H4 H4C8 8365 ENSG00000158406\n", "125 H4 canonical H4 H4C9 8294 ENSG00000276180\n", "127 H4 canonical H4 H4C11 8363 ENSG00000197238\n", "128 H4 canonical H4 H4C12 8362 ENSG00000273542\n", "129 H4 canonical H4 H4C13 8368 ENSG00000275126\n", "130 H4 canonical H4 H4C14 8370 ENSG00000270882\n", "131 H4 canonical H4 H4C15 554313 ENSG00000270276\n", "132 H4 canonical H4 H4-16 121504 ENSG00000197837\n", "\n", "[96 rows x 5 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hist_genes" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from pybiomart import Dataset" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "dataset = Dataset(name='hsapiens_gene_ensembl',\n", " host='http://www.ensembl.org')" ] }, { "cell_type": 
"code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "genedata = dataset.query(attributes=['ensembl_gene_id','ensembl_transcript_id','ensembl_peptide_id','refseq_mrna','refseq_peptide','transcript_biotype'], only_unique=False)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "histone_proteins=hist_genes.merge(genedata,left_on='Ensembl gene ID',right_on='Gene stable ID',suffixes=('','_y'),how='left').drop(columns='Gene stable ID')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Histone typeHistone variantHGNC symbolNCBI gene IDEnsembl gene IDTranscript stable IDProtein stable IDRefSeq mRNA IDRefSeq peptide IDTranscript type
0H1H1.0H1-03005ENSG00000189060ENST00000340857ENSP00000344504NM_005318NP_005309protein_coding
1H1H1.1H1-13024ENSG00000124610ENST00000244573ENSP00000244573NM_005325NP_005316protein_coding
2H1H1.2H1-23006ENSG00000187837ENST00000343677ENSP00000339566NM_005319NP_005310protein_coding
3H1H1.3H1-33007ENSG00000124575ENST00000244534ENSP00000244534NM_005320NP_005311protein_coding
4H1H1.4H1-43008ENSG00000168298ENST00000304218ENSP00000307705NM_005321NP_005312protein_coding
5H1H1.5H1-53009ENSG00000184357ENST00000331442ENSP00000330074NM_005322NP_005313protein_coding
6H1TS H1.6H1-63010ENSG00000187475ENST00000338379ENSP00000341214NM_005323NP_005314protein_coding
7H1TS H1.7H1-7341567ENSG00000187166ENST00000335017ENSP00000334805NM_181788NP_861453protein_coding
8H1OO H1.8H1-8132243ENSG00000178804ENST00000324382ENSP00000319799NM_153833NP_722575protein_coding
9H1OO H1.8H1-8132243ENSG00000178804ENST00000503977ENSP00000422964NM_001308262NP_001295191protein_coding
10H1TS H1.9(?)H1-9P373861ENSG00000188662NaNNaNNaNNaNNaN
11H1H1.10H1-108971ENSG00000184897ENST00000333762ENSP00000329662NM_006026NP_006017protein_coding
12H2ATS H2A.1H2AC1221613ENSG00000164508ENST00000297012ENSP00000297012NM_170745NP_734466protein_coding
13H2Acanonical H2AH2AC48335ENSG00000278463ENST00000615868ENSP00000483842NM_003513NP_003504protein_coding
14H2Acanonical H2AH2AC68334ENSG00000180573ENST00000314088ENSP00000321389NaNNaNnonsense_mediated_decay
15H2Acanonical H2AH2AC68334ENSG00000180573ENST00000602637ENSP00000473534NaNNaNprotein_coding
16H2Acanonical H2AH2AC68334ENSG00000180573ENST00000377791ENSP00000367022NM_003512NP_003503protein_coding
17H2Acanonical H2AH2AC73013ENSG00000196866ENST00000341023ENSP00000341094NM_021065NP_066409protein_coding
18H2Acanonical H2AH2AC83012ENSG00000277075ENST00000303910ENSP00000303373NM_021052NP_066390protein_coding
19H2Acanonical H2AH2AC118969ENSG00000196787ENST00000359193ENSP00000352119NM_021064NP_066408protein_coding
20H2Acanonical H2AH2AC1285235ENSG00000274997ENST00000377459ENSP00000366679NM_080596NP_542163protein_coding
21H2Acanonical H2AH2AC138329ENSG00000196747ENST00000358739ENSP00000351589NM_003509NP_003500protein_coding
22H2Acanonical H2AH2AC148331ENSG00000276368ENST00000333151ENSP00000328484NM_021066NP_066544protein_coding
23H2Acanonical H2AH2AC158330ENSG00000275221ENST00000618958ENSP00000482431NM_003510NP_003501protein_coding
24H2Acanonical H2AH2AC168332ENSG00000276903ENST00000613174ENSP00000482538NM_003511NP_003502protein_coding
25H2Acanonical H2AH2AC178336ENSG00000278677ENST00000359611ENSP00000352627NM_003514NP_003505protein_coding
26H2Acanonical H2AH2AC188337ENSG00000203812ENST00000369159ENSP00000358155NM_003516NP_003507protein_coding
27H2Acanonical H2AH2AC19723790ENSG00000272196ENST00000607355ENSP00000475814NM_001040874NP_001035807protein_coding
28H2Acanonical H2AH2AC208338ENSG00000184260ENST00000331380ENSP00000332194NM_003517NP_003508protein_coding
29H2Acanonical H2AH2AC21317772ENSG00000184270ENST00000331128ENSP00000332790NM_175065NP_778235protein_coding
.................................
132H3H3.3H3-3B3021ENSG00000132475ENST00000587171ENSP00000468484NaNNaNprotein_coding
133H3TS H3.4H3-48290ENSG00000168148ENST00000366696ENSP00000355657NM_003493NP_003484protein_coding
134H3H3.5H3-5440093ENSG00000188375ENST00000340398ENSP00000339835NM_001013699NP_001013721protein_coding
135H3cenH3CENPA1058ENSG00000115163ENST00000335756ENSP00000336868NM_001809NP_001800protein_coding
136H3cenH3CENPA1058ENSG00000115163ENST00000233505ENSP00000233505NM_001042426NP_001035891protein_coding
137H3cenH3CENPA1058ENSG00000115163ENST00000419525ENSP00000404963NaNNaNnonsense_mediated_decay
138H4canonical H4H4C18359ENSG00000278637ENST00000617569ENSP00000479106NM_003538NP_003529protein_coding
139H4canonical H4H4C28366ENSG00000278705ENST00000377745ENSP00000366974NM_003544NP_003535protein_coding
140H4canonical H4H4C38364ENSG00000197061ENST00000377803ENSP00000367034NM_003542NP_003533protein_coding
141H4canonical H4H4C48360ENSG00000277157ENST00000614247ENSP00000479461NM_003539NP_003530protein_coding
142H4canonical H4H4C58367ENSG00000276966ENST00000615164ENSP00000484789NM_003545NP_003536protein_coding
143H4canonical H4H4C68361ENSG00000274618ENST00000244537ENSP00000244537NM_003540NP_003531protein_coding
144H4canonical H4H4C78369ENSG00000275663ENST00000611444ENSP00000477870NM_003547NP_003538protein_coding
145H4canonical H4H4C88365ENSG00000158406ENST00000634956ENSP00000489567NaNNaNnonsense_mediated_decay
146H4canonical H4H4C88365ENSG00000158406ENST00000634560ENSP00000489319NaNNaNnonsense_mediated_decay
147H4canonical H4H4C88365ENSG00000158406ENST00000377727ENSP00000366956NM_003543NP_003534protein_coding
148H4canonical H4H4C88365ENSG00000158406ENST00000635491ENSP00000489236NaNNaNprotein_coding
149H4canonical H4H4C98294ENSG00000276180ENST00000615353ENSP00000481486NM_003495NP_003486protein_coding
150H4canonical H4H4C118363ENSG00000197238ENST00000355057ENSP00000347168NM_021968NP_068803protein_coding
151H4canonical H4H4C128362ENSG00000273542ENST00000611927ENSP00000479794NM_003541NP_003532protein_coding
152H4canonical H4H4C138368ENSG00000275126ENST00000618305ENSP00000480960NM_003546NP_003537protein_coding
153H4canonical H4H4C148370ENSG00000270882ENST00000578186ENSP00000462667NM_003548NP_003539protein_coding
154H4canonical H4H4C148370ENSG00000270882ENST00000618193ENSP00000478786NaNNaNnonsense_mediated_decay
155H4canonical H4H4C148370ENSG00000270882ENST00000614272ENSP00000478519NaNNaNnonsense_mediated_decay
156H4canonical H4H4C148370ENSG00000270882ENST00000613412ENSP00000481343NaNNaNnonsense_mediated_decay
157H4canonical H4H4C15554313ENSG00000270276ENST00000621520ENSP00000481507NaNNaNnonsense_mediated_decay
158H4canonical H4H4C15554313ENSG00000270276ENST00000612061ENSP00000482412NaNNaNnonsense_mediated_decay
159H4canonical H4H4C15554313ENSG00000270276ENST00000579512ENSP00000462355NM_001034077NP_001029249protein_coding
160H4canonical H4H4-16121504ENSG00000197837ENST00000358064ENSP00000350767NaNNaNnonsense_mediated_decay
161H4canonical H4H4-16121504ENSG00000197837ENST00000539745ENSP00000443017NM_175054NP_778224protein_coding
\n", "

162 rows × 10 columns

\n", "
" ], "text/plain": [ " Histone type Histone variant HGNC symbol NCBI gene ID Ensembl gene ID \\\n", "0 H1 H1.0 H1-0 3005 ENSG00000189060 \n", "1 H1 H1.1 H1-1 3024 ENSG00000124610 \n", "2 H1 H1.2 H1-2 3006 ENSG00000187837 \n", "3 H1 H1.3 H1-3 3007 ENSG00000124575 \n", "4 H1 H1.4 H1-4 3008 ENSG00000168298 \n", "5 H1 H1.5 H1-5 3009 ENSG00000184357 \n", "6 H1 TS H1.6 H1-6 3010 ENSG00000187475 \n", "7 H1 TS H1.7 H1-7 341567 ENSG00000187166 \n", "8 H1 OO H1.8 H1-8 132243 ENSG00000178804 \n", "9 H1 OO H1.8 H1-8 132243 ENSG00000178804 \n", "10 H1 TS H1.9(?) H1-9P 373861 ENSG00000188662 \n", "11 H1 H1.10 H1-10 8971 ENSG00000184897 \n", "12 H2A TS H2A.1 H2AC1 221613 ENSG00000164508 \n", "13 H2A canonical H2A H2AC4 8335 ENSG00000278463 \n", "14 H2A canonical H2A H2AC6 8334 ENSG00000180573 \n", "15 H2A canonical H2A H2AC6 8334 ENSG00000180573 \n", "16 H2A canonical H2A H2AC6 8334 ENSG00000180573 \n", "17 H2A canonical H2A H2AC7 3013 ENSG00000196866 \n", "18 H2A canonical H2A H2AC8 3012 ENSG00000277075 \n", "19 H2A canonical H2A H2AC11 8969 ENSG00000196787 \n", "20 H2A canonical H2A H2AC12 85235 ENSG00000274997 \n", "21 H2A canonical H2A H2AC13 8329 ENSG00000196747 \n", "22 H2A canonical H2A H2AC14 8331 ENSG00000276368 \n", "23 H2A canonical H2A H2AC15 8330 ENSG00000275221 \n", "24 H2A canonical H2A H2AC16 8332 ENSG00000276903 \n", "25 H2A canonical H2A H2AC17 8336 ENSG00000278677 \n", "26 H2A canonical H2A H2AC18 8337 ENSG00000203812 \n", "27 H2A canonical H2A H2AC19 723790 ENSG00000272196 \n", "28 H2A canonical H2A H2AC20 8338 ENSG00000184260 \n", "29 H2A canonical H2A H2AC21 317772 ENSG00000184270 \n", ".. ... ... ... ... ... 
\n", "132 H3 H3.3 H3-3B 3021 ENSG00000132475 \n", "133 H3 TS H3.4 H3-4 8290 ENSG00000168148 \n", "134 H3 H3.5 H3-5 440093 ENSG00000188375 \n", "135 H3 cenH3 CENPA 1058 ENSG00000115163 \n", "136 H3 cenH3 CENPA 1058 ENSG00000115163 \n", "137 H3 cenH3 CENPA 1058 ENSG00000115163 \n", "138 H4 canonical H4 H4C1 8359 ENSG00000278637 \n", "139 H4 canonical H4 H4C2 8366 ENSG00000278705 \n", "140 H4 canonical H4 H4C3 8364 ENSG00000197061 \n", "141 H4 canonical H4 H4C4 8360 ENSG00000277157 \n", "142 H4 canonical H4 H4C5 8367 ENSG00000276966 \n", "143 H4 canonical H4 H4C6 8361 ENSG00000274618 \n", "144 H4 canonical H4 H4C7 8369 ENSG00000275663 \n", "145 H4 canonical H4 H4C8 8365 ENSG00000158406 \n", "146 H4 canonical H4 H4C8 8365 ENSG00000158406 \n", "147 H4 canonical H4 H4C8 8365 ENSG00000158406 \n", "148 H4 canonical H4 H4C8 8365 ENSG00000158406 \n", "149 H4 canonical H4 H4C9 8294 ENSG00000276180 \n", "150 H4 canonical H4 H4C11 8363 ENSG00000197238 \n", "151 H4 canonical H4 H4C12 8362 ENSG00000273542 \n", "152 H4 canonical H4 H4C13 8368 ENSG00000275126 \n", "153 H4 canonical H4 H4C14 8370 ENSG00000270882 \n", "154 H4 canonical H4 H4C14 8370 ENSG00000270882 \n", "155 H4 canonical H4 H4C14 8370 ENSG00000270882 \n", "156 H4 canonical H4 H4C14 8370 ENSG00000270882 \n", "157 H4 canonical H4 H4C15 554313 ENSG00000270276 \n", "158 H4 canonical H4 H4C15 554313 ENSG00000270276 \n", "159 H4 canonical H4 H4C15 554313 ENSG00000270276 \n", "160 H4 canonical H4 H4-16 121504 ENSG00000197837 \n", "161 H4 canonical H4 H4-16 121504 ENSG00000197837 \n", "\n", " Transcript stable ID Protein stable ID RefSeq mRNA ID RefSeq peptide ID \\\n", "0 ENST00000340857 ENSP00000344504 NM_005318 NP_005309 \n", "1 ENST00000244573 ENSP00000244573 NM_005325 NP_005316 \n", "2 ENST00000343677 ENSP00000339566 NM_005319 NP_005310 \n", "3 ENST00000244534 ENSP00000244534 NM_005320 NP_005311 \n", "4 ENST00000304218 ENSP00000307705 NM_005321 NP_005312 \n", "5 ENST00000331442 ENSP00000330074 NM_005322 NP_005313 \n", 
"6 ENST00000338379 ENSP00000341214 NM_005323 NP_005314 \n", "7 ENST00000335017 ENSP00000334805 NM_181788 NP_861453 \n", "8 ENST00000324382 ENSP00000319799 NM_153833 NP_722575 \n", "9 ENST00000503977 ENSP00000422964 NM_001308262 NP_001295191 \n", "10 NaN NaN NaN NaN \n", "11 ENST00000333762 ENSP00000329662 NM_006026 NP_006017 \n", "12 ENST00000297012 ENSP00000297012 NM_170745 NP_734466 \n", "13 ENST00000615868 ENSP00000483842 NM_003513 NP_003504 \n", "14 ENST00000314088 ENSP00000321389 NaN NaN \n", "15 ENST00000602637 ENSP00000473534 NaN NaN \n", "16 ENST00000377791 ENSP00000367022 NM_003512 NP_003503 \n", "17 ENST00000341023 ENSP00000341094 NM_021065 NP_066409 \n", "18 ENST00000303910 ENSP00000303373 NM_021052 NP_066390 \n", "19 ENST00000359193 ENSP00000352119 NM_021064 NP_066408 \n", "20 ENST00000377459 ENSP00000366679 NM_080596 NP_542163 \n", "21 ENST00000358739 ENSP00000351589 NM_003509 NP_003500 \n", "22 ENST00000333151 ENSP00000328484 NM_021066 NP_066544 \n", "23 ENST00000618958 ENSP00000482431 NM_003510 NP_003501 \n", "24 ENST00000613174 ENSP00000482538 NM_003511 NP_003502 \n", "25 ENST00000359611 ENSP00000352627 NM_003514 NP_003505 \n", "26 ENST00000369159 ENSP00000358155 NM_003516 NP_003507 \n", "27 ENST00000607355 ENSP00000475814 NM_001040874 NP_001035807 \n", "28 ENST00000331380 ENSP00000332194 NM_003517 NP_003508 \n", "29 ENST00000331128 ENSP00000332790 NM_175065 NP_778235 \n", ".. ... ... ... ... 
\n", "132 ENST00000587171 ENSP00000468484 NaN NaN \n", "133 ENST00000366696 ENSP00000355657 NM_003493 NP_003484 \n", "134 ENST00000340398 ENSP00000339835 NM_001013699 NP_001013721 \n", "135 ENST00000335756 ENSP00000336868 NM_001809 NP_001800 \n", "136 ENST00000233505 ENSP00000233505 NM_001042426 NP_001035891 \n", "137 ENST00000419525 ENSP00000404963 NaN NaN \n", "138 ENST00000617569 ENSP00000479106 NM_003538 NP_003529 \n", "139 ENST00000377745 ENSP00000366974 NM_003544 NP_003535 \n", "140 ENST00000377803 ENSP00000367034 NM_003542 NP_003533 \n", "141 ENST00000614247 ENSP00000479461 NM_003539 NP_003530 \n", "142 ENST00000615164 ENSP00000484789 NM_003545 NP_003536 \n", "143 ENST00000244537 ENSP00000244537 NM_003540 NP_003531 \n", "144 ENST00000611444 ENSP00000477870 NM_003547 NP_003538 \n", "145 ENST00000634956 ENSP00000489567 NaN NaN \n", "146 ENST00000634560 ENSP00000489319 NaN NaN \n", "147 ENST00000377727 ENSP00000366956 NM_003543 NP_003534 \n", "148 ENST00000635491 ENSP00000489236 NaN NaN \n", "149 ENST00000615353 ENSP00000481486 NM_003495 NP_003486 \n", "150 ENST00000355057 ENSP00000347168 NM_021968 NP_068803 \n", "151 ENST00000611927 ENSP00000479794 NM_003541 NP_003532 \n", "152 ENST00000618305 ENSP00000480960 NM_003546 NP_003537 \n", "153 ENST00000578186 ENSP00000462667 NM_003548 NP_003539 \n", "154 ENST00000618193 ENSP00000478786 NaN NaN \n", "155 ENST00000614272 ENSP00000478519 NaN NaN \n", "156 ENST00000613412 ENSP00000481343 NaN NaN \n", "157 ENST00000621520 ENSP00000481507 NaN NaN \n", "158 ENST00000612061 ENSP00000482412 NaN NaN \n", "159 ENST00000579512 ENSP00000462355 NM_001034077 NP_001029249 \n", "160 ENST00000358064 ENSP00000350767 NaN NaN \n", "161 ENST00000539745 ENSP00000443017 NM_175054 NP_778224 \n", "\n", " Transcript type \n", "0 protein_coding \n", "1 protein_coding \n", "2 protein_coding \n", "3 protein_coding \n", "4 protein_coding \n", "5 protein_coding \n", "6 protein_coding \n", "7 protein_coding \n", "8 protein_coding \n", "9 
protein_coding \n", "10 NaN \n", "11 protein_coding \n", "12 protein_coding \n", "13 protein_coding \n", "14 nonsense_mediated_decay \n", "15 protein_coding \n", "16 protein_coding \n", "17 protein_coding \n", "18 protein_coding \n", "19 protein_coding \n", "20 protein_coding \n", "21 protein_coding \n", "22 protein_coding \n", "23 protein_coding \n", "24 protein_coding \n", "25 protein_coding \n", "26 protein_coding \n", "27 protein_coding \n", "28 protein_coding \n", "29 protein_coding \n", ".. ... \n", "132 protein_coding \n", "133 protein_coding \n", "134 protein_coding \n", "135 protein_coding \n", "136 protein_coding \n", "137 nonsense_mediated_decay \n", "138 protein_coding \n", "139 protein_coding \n", "140 protein_coding \n", "141 protein_coding \n", "142 protein_coding \n", "143 protein_coding \n", "144 protein_coding \n", "145 nonsense_mediated_decay \n", "146 nonsense_mediated_decay \n", "147 protein_coding \n", "148 protein_coding \n", "149 protein_coding \n", "150 protein_coding \n", "151 protein_coding \n", "152 protein_coding \n", "153 protein_coding \n", "154 nonsense_mediated_decay \n", "155 nonsense_mediated_decay \n", "156 nonsense_mediated_decay \n", "157 nonsense_mediated_decay \n", "158 nonsense_mediated_decay \n", "159 protein_coding \n", "160 nonsense_mediated_decay \n", "161 protein_coding \n", "\n", "[162 rows x 10 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "histone_proteins" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
 "import requests\n",
 "\n",
 "# Ensembl REST endpoint returning the sequence for a stable ID as plain text.\n",
 "ENSEMBL_SEQUENCE_URL = 'https://rest.ensembl.org/sequence/id/%s?content-type=text/plain'\n",
 "REQUEST_TIMEOUT = 30  # seconds, so a stalled request cannot hang the cell forever\n",
 "\n",
 "seqs = []    # protein sequence per row (None when the row has no protein ID)\n",
 "seqlen = []  # sequence length per row (0 when there is no sequence)\n",
 "refs = []    # PMIDs value per row, taken from the gene-level table `d`\n",
 "for _, row in histone_proteins.iterrows():\n",
 "    protein_id = row['Protein stable ID']\n",
 "    if pd.isna(protein_id):\n",
 "        # Rows without a protein ID (e.g. H1-9P) used to be requested as the\n",
 "        # literal ID 'nan', storing Ensembl's JSON error body as a sequence.\n",
 "        seq = None\n",
 "    else:\n",
 "        response = requests.get(ENSEMBL_SEQUENCE_URL % protein_id, timeout=REQUEST_TIMEOUT)\n",
 "        # Stop on HTTP errors rather than saving an error payload as a sequence.\n",
 "        response.raise_for_status()\n",
 "        # Decode to str; `.content` is bytes and shows up as b'...' in the table.\n",
 "        seq = response.text.strip()\n",
 "    seqs.append(seq)\n",
 "    seqlen.append(0 if seq is None else len(seq))\n",
 "    # First PMIDs entry of the gene with the same NCBI gene ID; raises IndexError\n",
 "    # if `d` has no such gene.\n",
 "    refs.append(d.loc[d['NCBI gene ID'] == row['NCBI gene ID'], 'PMIDs'].values[0])\n",
 "\n",
 "histone_proteins['Protein sequence'] = seqs\n",
 "histone_proteins['Protein length'] = seqlen\n",
 "histone_proteins['References'] = refs"
 ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Histone typeHistone variantHGNC symbolNCBI gene IDEnsembl gene IDTranscript stable IDProtein stable IDRefSeq mRNA IDRefSeq peptide IDTranscript typeProtein sequenceProtein lengthReferences
0H1H1.0H1-03005ENSG00000189060ENST00000340857ENSP00000344504NM_005318NP_005309protein_codingb'MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAG...19426689747
1H1H1.1H1-13024ENSG00000124610ENST00000244573ENSP00000244573NM_005325NP_005316protein_codingb'MSETVPPAPAASAAPEKPLAGKKAKKPAKAAAASKKKPAGPSVS...21526689747
2H1H1.2H1-23006ENSG00000187837ENST00000343677ENSP00000339566NM_005319NP_005310protein_codingb'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELI...21326689747
3H1H1.3H1-33007ENSG00000124575ENST00000244534ENSP00000244534NM_005320NP_005311protein_codingb'MSETAPLAPTIPAPAEKTPVKKKAKKAGATAGKRKASGPPVSEL...22126689747
4H1H1.4H1-43008ENSG00000168298ENST00000304218ENSP00000307705NM_005321NP_005312protein_codingb'MSETAPAAPAAPAPAEKTPVKKKARKSAGAAKRKASGPPVSELI...21926689747
5H1H1.5H1-53009ENSG00000184357ENST00000331442ENSP00000330074NM_005322NP_005313protein_codingb'MSETAPAETATPAPVEKSPAKKKATKKAAGAGAAKRKATGPPVS...22626689747
6H1TS H1.6H1-63010ENSG00000187475ENST00000338379ENSP00000341214NM_005323NP_005314protein_codingb'MSETVPAASASAGVAAMEKLPTKKRGRKPAGLISASRKVPNLSV...20726689747
7H1TS H1.7H1-7341567ENSG00000187166ENST00000335017ENSP00000334805NM_181788NP_861453protein_codingb'MEQALTGEAQSRWPRRGGSGAMAEAPGPSGESRGHSATQLPAEK...25526689747
8H1OO H1.8H1-8132243ENSG00000178804ENST00000324382ENSP00000319799NM_153833NP_722575protein_codingb'MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHS...34626689747
9H1OO H1.8H1-8132243ENSG00000178804ENST00000503977ENSP00000422964NM_001308262NP_001295191protein_codingb'MAPATAPRRAGEAKGKGPKKPSEAKEDPPNVGKVKKAAKRPAKV...20726689747
10H1TS H1.9(?)H1-9P373861ENSG00000188662NaNNaNNaNNaNNaNb'{\"error\":\"ID \\'nan\\' not found\"}'3012920187 26689747 17852044
11H1H1.10H1-108971ENSG00000184897ENST00000333762ENSP00000329662NM_006026NP_006017protein_codingb'MSVELEEALPVTTAEGMAKKVTKAGGSAALSPSKKRKNSKKKNQ...21326689747
12H2ATS H2A.1H2AC1221613ENSG00000164508ENST00000297012ENSP00000297012NM_170745NP_734466protein_codingb'MSGRGKQGGKARAKSKSRSSRAGLQFPVGRIHRLLRKGNYAERI...1312011515 7068607 24506885
13H2Acanonical H2AH2AC48335ENSG00000278463ENST00000615868ENSP00000483842NM_003513NP_003504protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV...13012408966 25731851
14H2Acanonical H2AH2AC68334ENSG00000180573ENST00000314088ENSP00000321389NaNNaNnonsense_mediated_decayb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
15H2Acanonical H2AH2AC68334ENSG00000180573ENST00000602637ENSP00000473534NaNNaNprotein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
16H2Acanonical H2AH2AC68334ENSG00000180573ENST00000377791ENSP00000367022NM_003512NP_003503protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
17H2Acanonical H2AH2AC73013ENSG00000196866ENST00000341023ENSP00000341094NM_021065NP_066409protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV...13012408966 25731851
18H2Acanonical H2AH2AC83012ENSG00000277075ENST00000303910ENSP00000303373NM_021052NP_066390protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV...13012408966 25731851
19H2Acanonical H2AH2AC118969ENSG00000196787ENST00000359193ENSP00000352119NM_021064NP_066408protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
20H2Acanonical H2AH2AC1285235ENSG00000274997ENST00000377459ENSP00000366679NM_080596NP_542163protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...12812408966 25731851
21H2Acanonical H2AH2AC138329ENSG00000196747ENST00000358739ENSP00000351589NM_003509NP_003500protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
22H2Acanonical H2AH2AC148331ENSG00000276368ENST00000333151ENSP00000328484NM_021066NP_066544protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...12812408966 25731851
23H2Acanonical H2AH2AC158330ENSG00000275221ENST00000618958ENSP00000482431NM_003510NP_003501protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
24H2Acanonical H2AH2AC168332ENSG00000276903ENST00000613174ENSP00000482538NM_003511NP_003502protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
25H2Acanonical H2AH2AC178336ENSG00000278677ENST00000359611ENSP00000352627NM_003514NP_003505protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
26H2Acanonical H2AH2AC188337ENSG00000203812ENST00000369159ENSP00000358155NM_003516NP_003507protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
27H2Acanonical H2AH2AC19723790ENSG00000272196ENST00000607355ENSP00000475814NM_001040874NP_001035807protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
28H2Acanonical H2AH2AC208338ENSG00000184260ENST00000331380ENSP00000332194NM_003517NP_003508protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...12912408966 25731851
29H2Acanonical H2AH2AC21317772ENSG00000184270ENST00000331128ENSP00000332790NM_175065NP_778235protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
..........................................
132H3H3.3H3-3B3021ENSG00000132475ENST00000587171ENSP00000468484NaNNaNprotein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP...15119412883
133H3TS H3.4H3-48290ENSG00000168148ENST00000366696ENSP00000355657NM_003493NP_003484protein_codingb'MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYRP...1368986613
134H3H3.5H3-5440093ENSG00000188375ENST00000340398ENSP00000339835NM_001013699NP_001013721protein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSTPSTCGVKPHRYRPG...13521274551
135H3cenH3CENPA1058ENSG00000115163ENST00000335756ENSP00000336868NM_001809NP_001800protein_codingb'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR...14023324462
136H3cenH3CENPA1058ENSG00000115163ENST00000233505ENSP00000233505NM_001042426NP_001035891protein_codingb'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR...11423324462
137H3cenH3CENPA1058ENSG00000115163ENST00000419525ENSP00000404963NaNNaNnonsense_mediated_decayb'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR...8623324462
138H4canonical H4H4C18359ENSG00000278637ENST00000617569ENSP00000479106NM_003538NP_003529protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
139H4canonical H4H4C28366ENSG00000278705ENST00000377745ENSP00000366974NM_003544NP_003535protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
140H4canonical H4H4C38364ENSG00000197061ENST00000377803ENSP00000367034NM_003542NP_003533protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
141H4canonical H4H4C48360ENSG00000277157ENST00000614247ENSP00000479461NM_003539NP_003530protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
142H4canonical H4H4C58367ENSG00000276966ENST00000615164ENSP00000484789NM_003545NP_003536protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
143H4canonical H4H4C68361ENSG00000274618ENST00000244537ENSP00000244537NM_003540NP_003531protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
144H4canonical H4H4C78369ENSG00000275663ENST00000611444ENSP00000477870NM_003547NP_003538protein_codingb'MSVRGKAGKGLGKGGAKCHRKVLSDNIQGITKCTIRRLARHGGV...9812408966
145H4canonical H4H4C88365ENSG00000158406ENST00000634956ENSP00000489567NaNNaNnonsense_mediated_decayb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
146H4canonical H4H4C88365ENSG00000158406ENST00000634560ENSP00000489319NaNNaNnonsense_mediated_decayb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
147H4canonical H4H4C88365ENSG00000158406ENST00000377727ENSP00000366956NM_003543NP_003534protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
148H4canonical H4H4C88365ENSG00000158406ENST00000635491ENSP00000489236NaNNaNprotein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
149H4canonical H4H4C98294ENSG00000276180ENST00000615353ENSP00000481486NM_003495NP_003486protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
150H4canonical H4H4C118363ENSG00000197238ENST00000355057ENSP00000347168NM_021968NP_068803protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
151H4canonical H4H4C128362ENSG00000273542ENST00000611927ENSP00000479794NM_003541NP_003532protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
152H4canonical H4H4C138368ENSG00000275126ENST00000618305ENSP00000480960NM_003546NP_003537protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
153H4canonical H4H4C148370ENSG00000270882ENST00000578186ENSP00000462667NM_003548NP_003539protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
154H4canonical H4H4C148370ENSG00000270882ENST00000618193ENSP00000478786NaNNaNnonsense_mediated_decayb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
155H4canonical H4H4C148370ENSG00000270882ENST00000614272ENSP00000478519NaNNaNnonsense_mediated_decayb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
156H4canonical H4H4C148370ENSG00000270882ENST00000613412ENSP00000481343NaNNaNnonsense_mediated_decayb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
157H4canonical H4H4C15554313ENSG00000270276ENST00000621520ENSP00000481507NaNNaNnonsense_mediated_decayb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
158H4canonical H4H4C15554313ENSG00000270276ENST00000612061ENSP00000482412NaNNaNnonsense_mediated_decayb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
159H4canonical H4H4C15554313ENSG00000270276ENST00000579512ENSP00000462355NM_001034077NP_001029249protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
160H4canonical H4H4-16121504ENSG00000197837ENST00000358064ENSP00000350767NaNNaNnonsense_mediated_decayb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
161H4canonical H4H4-16121504ENSG00000197837ENST00000539745ENSP00000443017NM_175054NP_778224protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
\n", "

162 rows × 13 columns

\n", "
" ], "text/plain": [ " Histone type Histone variant HGNC symbol NCBI gene ID Ensembl gene ID \\\n", "0 H1 H1.0 H1-0 3005 ENSG00000189060 \n", "1 H1 H1.1 H1-1 3024 ENSG00000124610 \n", "2 H1 H1.2 H1-2 3006 ENSG00000187837 \n", "3 H1 H1.3 H1-3 3007 ENSG00000124575 \n", "4 H1 H1.4 H1-4 3008 ENSG00000168298 \n", "5 H1 H1.5 H1-5 3009 ENSG00000184357 \n", "6 H1 TS H1.6 H1-6 3010 ENSG00000187475 \n", "7 H1 TS H1.7 H1-7 341567 ENSG00000187166 \n", "8 H1 OO H1.8 H1-8 132243 ENSG00000178804 \n", "9 H1 OO H1.8 H1-8 132243 ENSG00000178804 \n", "10 H1 TS H1.9(?) H1-9P 373861 ENSG00000188662 \n", "11 H1 H1.10 H1-10 8971 ENSG00000184897 \n", "12 H2A TS H2A.1 H2AC1 221613 ENSG00000164508 \n", "13 H2A canonical H2A H2AC4 8335 ENSG00000278463 \n", "14 H2A canonical H2A H2AC6 8334 ENSG00000180573 \n", "15 H2A canonical H2A H2AC6 8334 ENSG00000180573 \n", "16 H2A canonical H2A H2AC6 8334 ENSG00000180573 \n", "17 H2A canonical H2A H2AC7 3013 ENSG00000196866 \n", "18 H2A canonical H2A H2AC8 3012 ENSG00000277075 \n", "19 H2A canonical H2A H2AC11 8969 ENSG00000196787 \n", "20 H2A canonical H2A H2AC12 85235 ENSG00000274997 \n", "21 H2A canonical H2A H2AC13 8329 ENSG00000196747 \n", "22 H2A canonical H2A H2AC14 8331 ENSG00000276368 \n", "23 H2A canonical H2A H2AC15 8330 ENSG00000275221 \n", "24 H2A canonical H2A H2AC16 8332 ENSG00000276903 \n", "25 H2A canonical H2A H2AC17 8336 ENSG00000278677 \n", "26 H2A canonical H2A H2AC18 8337 ENSG00000203812 \n", "27 H2A canonical H2A H2AC19 723790 ENSG00000272196 \n", "28 H2A canonical H2A H2AC20 8338 ENSG00000184260 \n", "29 H2A canonical H2A H2AC21 317772 ENSG00000184270 \n", ".. ... ... ... ... ... 
\n", "132 H3 H3.3 H3-3B 3021 ENSG00000132475 \n", "133 H3 TS H3.4 H3-4 8290 ENSG00000168148 \n", "134 H3 H3.5 H3-5 440093 ENSG00000188375 \n", "135 H3 cenH3 CENPA 1058 ENSG00000115163 \n", "136 H3 cenH3 CENPA 1058 ENSG00000115163 \n", "137 H3 cenH3 CENPA 1058 ENSG00000115163 \n", "138 H4 canonical H4 H4C1 8359 ENSG00000278637 \n", "139 H4 canonical H4 H4C2 8366 ENSG00000278705 \n", "140 H4 canonical H4 H4C3 8364 ENSG00000197061 \n", "141 H4 canonical H4 H4C4 8360 ENSG00000277157 \n", "142 H4 canonical H4 H4C5 8367 ENSG00000276966 \n", "143 H4 canonical H4 H4C6 8361 ENSG00000274618 \n", "144 H4 canonical H4 H4C7 8369 ENSG00000275663 \n", "145 H4 canonical H4 H4C8 8365 ENSG00000158406 \n", "146 H4 canonical H4 H4C8 8365 ENSG00000158406 \n", "147 H4 canonical H4 H4C8 8365 ENSG00000158406 \n", "148 H4 canonical H4 H4C8 8365 ENSG00000158406 \n", "149 H4 canonical H4 H4C9 8294 ENSG00000276180 \n", "150 H4 canonical H4 H4C11 8363 ENSG00000197238 \n", "151 H4 canonical H4 H4C12 8362 ENSG00000273542 \n", "152 H4 canonical H4 H4C13 8368 ENSG00000275126 \n", "153 H4 canonical H4 H4C14 8370 ENSG00000270882 \n", "154 H4 canonical H4 H4C14 8370 ENSG00000270882 \n", "155 H4 canonical H4 H4C14 8370 ENSG00000270882 \n", "156 H4 canonical H4 H4C14 8370 ENSG00000270882 \n", "157 H4 canonical H4 H4C15 554313 ENSG00000270276 \n", "158 H4 canonical H4 H4C15 554313 ENSG00000270276 \n", "159 H4 canonical H4 H4C15 554313 ENSG00000270276 \n", "160 H4 canonical H4 H4-16 121504 ENSG00000197837 \n", "161 H4 canonical H4 H4-16 121504 ENSG00000197837 \n", "\n", " Transcript stable ID Protein stable ID RefSeq mRNA ID RefSeq peptide ID \\\n", "0 ENST00000340857 ENSP00000344504 NM_005318 NP_005309 \n", "1 ENST00000244573 ENSP00000244573 NM_005325 NP_005316 \n", "2 ENST00000343677 ENSP00000339566 NM_005319 NP_005310 \n", "3 ENST00000244534 ENSP00000244534 NM_005320 NP_005311 \n", "4 ENST00000304218 ENSP00000307705 NM_005321 NP_005312 \n", "5 ENST00000331442 ENSP00000330074 NM_005322 NP_005313 \n", 
"6 ENST00000338379 ENSP00000341214 NM_005323 NP_005314 \n", "7 ENST00000335017 ENSP00000334805 NM_181788 NP_861453 \n", "8 ENST00000324382 ENSP00000319799 NM_153833 NP_722575 \n", "9 ENST00000503977 ENSP00000422964 NM_001308262 NP_001295191 \n", "10 NaN NaN NaN NaN \n", "11 ENST00000333762 ENSP00000329662 NM_006026 NP_006017 \n", "12 ENST00000297012 ENSP00000297012 NM_170745 NP_734466 \n", "13 ENST00000615868 ENSP00000483842 NM_003513 NP_003504 \n", "14 ENST00000314088 ENSP00000321389 NaN NaN \n", "15 ENST00000602637 ENSP00000473534 NaN NaN \n", "16 ENST00000377791 ENSP00000367022 NM_003512 NP_003503 \n", "17 ENST00000341023 ENSP00000341094 NM_021065 NP_066409 \n", "18 ENST00000303910 ENSP00000303373 NM_021052 NP_066390 \n", "19 ENST00000359193 ENSP00000352119 NM_021064 NP_066408 \n", "20 ENST00000377459 ENSP00000366679 NM_080596 NP_542163 \n", "21 ENST00000358739 ENSP00000351589 NM_003509 NP_003500 \n", "22 ENST00000333151 ENSP00000328484 NM_021066 NP_066544 \n", "23 ENST00000618958 ENSP00000482431 NM_003510 NP_003501 \n", "24 ENST00000613174 ENSP00000482538 NM_003511 NP_003502 \n", "25 ENST00000359611 ENSP00000352627 NM_003514 NP_003505 \n", "26 ENST00000369159 ENSP00000358155 NM_003516 NP_003507 \n", "27 ENST00000607355 ENSP00000475814 NM_001040874 NP_001035807 \n", "28 ENST00000331380 ENSP00000332194 NM_003517 NP_003508 \n", "29 ENST00000331128 ENSP00000332790 NM_175065 NP_778235 \n", ".. ... ... ... ... 
\n", "132 ENST00000587171 ENSP00000468484 NaN NaN \n", "133 ENST00000366696 ENSP00000355657 NM_003493 NP_003484 \n", "134 ENST00000340398 ENSP00000339835 NM_001013699 NP_001013721 \n", "135 ENST00000335756 ENSP00000336868 NM_001809 NP_001800 \n", "136 ENST00000233505 ENSP00000233505 NM_001042426 NP_001035891 \n", "137 ENST00000419525 ENSP00000404963 NaN NaN \n", "138 ENST00000617569 ENSP00000479106 NM_003538 NP_003529 \n", "139 ENST00000377745 ENSP00000366974 NM_003544 NP_003535 \n", "140 ENST00000377803 ENSP00000367034 NM_003542 NP_003533 \n", "141 ENST00000614247 ENSP00000479461 NM_003539 NP_003530 \n", "142 ENST00000615164 ENSP00000484789 NM_003545 NP_003536 \n", "143 ENST00000244537 ENSP00000244537 NM_003540 NP_003531 \n", "144 ENST00000611444 ENSP00000477870 NM_003547 NP_003538 \n", "145 ENST00000634956 ENSP00000489567 NaN NaN \n", "146 ENST00000634560 ENSP00000489319 NaN NaN \n", "147 ENST00000377727 ENSP00000366956 NM_003543 NP_003534 \n", "148 ENST00000635491 ENSP00000489236 NaN NaN \n", "149 ENST00000615353 ENSP00000481486 NM_003495 NP_003486 \n", "150 ENST00000355057 ENSP00000347168 NM_021968 NP_068803 \n", "151 ENST00000611927 ENSP00000479794 NM_003541 NP_003532 \n", "152 ENST00000618305 ENSP00000480960 NM_003546 NP_003537 \n", "153 ENST00000578186 ENSP00000462667 NM_003548 NP_003539 \n", "154 ENST00000618193 ENSP00000478786 NaN NaN \n", "155 ENST00000614272 ENSP00000478519 NaN NaN \n", "156 ENST00000613412 ENSP00000481343 NaN NaN \n", "157 ENST00000621520 ENSP00000481507 NaN NaN \n", "158 ENST00000612061 ENSP00000482412 NaN NaN \n", "159 ENST00000579512 ENSP00000462355 NM_001034077 NP_001029249 \n", "160 ENST00000358064 ENSP00000350767 NaN NaN \n", "161 ENST00000539745 ENSP00000443017 NM_175054 NP_778224 \n", "\n", " Transcript type \\\n", "0 protein_coding \n", "1 protein_coding \n", "2 protein_coding \n", "3 protein_coding \n", "4 protein_coding \n", "5 protein_coding \n", "6 protein_coding \n", "7 protein_coding \n", "8 protein_coding \n", "9 
protein_coding \n", "10 NaN \n", "11 protein_coding \n", "12 protein_coding \n", "13 protein_coding \n", "14 nonsense_mediated_decay \n", "15 protein_coding \n", "16 protein_coding \n", "17 protein_coding \n", "18 protein_coding \n", "19 protein_coding \n", "20 protein_coding \n", "21 protein_coding \n", "22 protein_coding \n", "23 protein_coding \n", "24 protein_coding \n", "25 protein_coding \n", "26 protein_coding \n", "27 protein_coding \n", "28 protein_coding \n", "29 protein_coding \n", ".. ... \n", "132 protein_coding \n", "133 protein_coding \n", "134 protein_coding \n", "135 protein_coding \n", "136 protein_coding \n", "137 nonsense_mediated_decay \n", "138 protein_coding \n", "139 protein_coding \n", "140 protein_coding \n", "141 protein_coding \n", "142 protein_coding \n", "143 protein_coding \n", "144 protein_coding \n", "145 nonsense_mediated_decay \n", "146 nonsense_mediated_decay \n", "147 protein_coding \n", "148 protein_coding \n", "149 protein_coding \n", "150 protein_coding \n", "151 protein_coding \n", "152 protein_coding \n", "153 protein_coding \n", "154 nonsense_mediated_decay \n", "155 nonsense_mediated_decay \n", "156 nonsense_mediated_decay \n", "157 nonsense_mediated_decay \n", "158 nonsense_mediated_decay \n", "159 protein_coding \n", "160 nonsense_mediated_decay \n", "161 protein_coding \n", "\n", " Protein sequence Protein length \\\n", "0 b'MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAG... 194 \n", "1 b'MSETVPPAPAASAAPEKPLAGKKAKKPAKAAAASKKKPAGPSVS... 215 \n", "2 b'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELI... 213 \n", "3 b'MSETAPLAPTIPAPAEKTPVKKKAKKAGATAGKRKASGPPVSEL... 221 \n", "4 b'MSETAPAAPAAPAPAEKTPVKKKARKSAGAAKRKASGPPVSELI... 219 \n", "5 b'MSETAPAETATPAPVEKSPAKKKATKKAAGAGAAKRKATGPPVS... 226 \n", "6 b'MSETVPAASASAGVAAMEKLPTKKRGRKPAGLISASRKVPNLSV... 207 \n", "7 b'MEQALTGEAQSRWPRRGGSGAMAEAPGPSGESRGHSATQLPAEK... 255 \n", "8 b'MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHS... 346 \n", "9 b'MAPATAPRRAGEAKGKGPKKPSEAKEDPPNVGKVKKAAKRPAKV... 
207 \n", "10 b'{\"error\":\"ID \\'nan\\' not found\"}' 30 \n", "11 b'MSVELEEALPVTTAEGMAKKVTKAGGSAALSPSKKRKNSKKKNQ... 213 \n", "12 b'MSGRGKQGGKARAKSKSRSSRAGLQFPVGRIHRLLRKGNYAERI... 131 \n", "13 b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... 130 \n", "14 b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", "15 b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", "16 b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", "17 b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... 130 \n", "18 b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... 130 \n", "19 b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", "20 b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... 128 \n", "21 b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", "22 b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... 128 \n", "23 b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", "24 b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", "25 b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", "26 b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", "27 b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", "28 b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... 129 \n", "29 b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... 130 \n", ".. ... ... \n", "132 b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... 151 \n", "133 b'MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYRP... 136 \n", "134 b'MARTKQTARKSTGGKAPRKQLATKAARKSTPSTCGVKPHRYRPG... 135 \n", "135 b'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR... 140 \n", "136 b'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR... 114 \n", "137 b'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR... 86 \n", "138 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "139 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "140 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "141 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "142 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 
103 \n", "143 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "144 b'MSVRGKAGKGLGKGGAKCHRKVLSDNIQGITKCTIRRLARHGGV... 98 \n", "145 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "146 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "147 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "148 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "149 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "150 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "151 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "152 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "153 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "154 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "155 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "156 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "157 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "158 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "159 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "160 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "161 b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 103 \n", "\n", " References \n", "0 26689747 \n", "1 26689747 \n", "2 26689747 \n", "3 26689747 \n", "4 26689747 \n", "5 26689747 \n", "6 26689747 \n", "7 26689747 \n", "8 26689747 \n", "9 26689747 \n", "10 12920187 26689747 17852044 \n", "11 26689747 \n", "12 2011515 7068607 24506885 \n", "13 12408966 25731851 \n", "14 12408966 25731851 \n", "15 12408966 25731851 \n", "16 12408966 25731851 \n", "17 12408966 25731851 \n", "18 12408966 25731851 \n", "19 12408966 25731851 \n", "20 12408966 25731851 \n", "21 12408966 25731851 \n", "22 12408966 25731851 \n", "23 12408966 25731851 \n", "24 12408966 25731851 \n", "25 12408966 25731851 \n", "26 12408966 25731851 \n", "27 12408966 25731851 \n", "28 12408966 25731851 \n", "29 12408966 25731851 \n", ".. ... 
\n", "132 19412883 \n", "133 8986613 \n", "134 21274551 \n", "135 23324462 \n", "136 23324462 \n", "137 23324462 \n", "138 12408966 \n", "139 12408966 \n", "140 12408966 \n", "141 12408966 \n", "142 12408966 \n", "143 12408966 \n", "144 12408966 \n", "145 12408966 \n", "146 12408966 \n", "147 12408966 \n", "148 12408966 \n", "149 12408966 \n", "150 12408966 \n", "151 12408966 \n", "152 12408966 \n", "153 12408966 \n", "154 12408966 \n", "155 12408966 \n", "156 12408966 \n", "157 12408966 \n", "158 12408966 \n", "159 12408966 \n", "160 12408966 \n", "161 12408966 \n", "\n", "[162 rows x 13 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "histone_proteins" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# hp=(histone_proteins[histone_proteins['Transcript type'].isin(['protein_coding'])]).drop_duplicates(subset=['Transcript stable ID'])\n", "hp=(histone_proteins[histone_proteins['Transcript type'].isin(['protein_coding',np.nan])]).sort_values('RefSeq peptide ID', ascending=True).drop_duplicates(subset=['Ensembl gene ID','Protein sequence']).sort_index()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Histone typeHistone variantHGNC symbolNCBI gene IDEnsembl gene IDTranscript stable IDProtein stable IDRefSeq mRNA IDRefSeq peptide IDTranscript typeProtein sequenceProtein lengthReferences
0H1H1.0H1-03005ENSG00000189060ENST00000340857ENSP00000344504NM_005318NP_005309protein_codingb'MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAG...19426689747
1H1H1.1H1-13024ENSG00000124610ENST00000244573ENSP00000244573NM_005325NP_005316protein_codingb'MSETVPPAPAASAAPEKPLAGKKAKKPAKAAAASKKKPAGPSVS...21526689747
2H1H1.2H1-23006ENSG00000187837ENST00000343677ENSP00000339566NM_005319NP_005310protein_codingb'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELI...21326689747
3H1H1.3H1-33007ENSG00000124575ENST00000244534ENSP00000244534NM_005320NP_005311protein_codingb'MSETAPLAPTIPAPAEKTPVKKKAKKAGATAGKRKASGPPVSEL...22126689747
4H1H1.4H1-43008ENSG00000168298ENST00000304218ENSP00000307705NM_005321NP_005312protein_codingb'MSETAPAAPAAPAPAEKTPVKKKARKSAGAAKRKASGPPVSELI...21926689747
5H1H1.5H1-53009ENSG00000184357ENST00000331442ENSP00000330074NM_005322NP_005313protein_codingb'MSETAPAETATPAPVEKSPAKKKATKKAAGAGAAKRKATGPPVS...22626689747
6H1TS H1.6H1-63010ENSG00000187475ENST00000338379ENSP00000341214NM_005323NP_005314protein_codingb'MSETVPAASASAGVAAMEKLPTKKRGRKPAGLISASRKVPNLSV...20726689747
7H1TS H1.7H1-7341567ENSG00000187166ENST00000335017ENSP00000334805NM_181788NP_861453protein_codingb'MEQALTGEAQSRWPRRGGSGAMAEAPGPSGESRGHSATQLPAEK...25526689747
8H1OO H1.8H1-8132243ENSG00000178804ENST00000324382ENSP00000319799NM_153833NP_722575protein_codingb'MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHS...34626689747
9H1OO H1.8H1-8132243ENSG00000178804ENST00000503977ENSP00000422964NM_001308262NP_001295191protein_codingb'MAPATAPRRAGEAKGKGPKKPSEAKEDPPNVGKVKKAAKRPAKV...20726689747
10H1TS H1.9(?)H1-9P373861ENSG00000188662NaNNaNNaNNaNNaNb'{\"error\":\"ID \\'nan\\' not found\"}'3012920187 26689747 17852044
11H1H1.10H1-108971ENSG00000184897ENST00000333762ENSP00000329662NM_006026NP_006017protein_codingb'MSVELEEALPVTTAEGMAKKVTKAGGSAALSPSKKRKNSKKKNQ...21326689747
12H2ATS H2A.1H2AC1221613ENSG00000164508ENST00000297012ENSP00000297012NM_170745NP_734466protein_codingb'MSGRGKQGGKARAKSKSRSSRAGLQFPVGRIHRLLRKGNYAERI...1312011515 7068607 24506885
13H2Acanonical H2AH2AC48335ENSG00000278463ENST00000615868ENSP00000483842NM_003513NP_003504protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV...13012408966 25731851
16H2Acanonical H2AH2AC68334ENSG00000180573ENST00000377791ENSP00000367022NM_003512NP_003503protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
17H2Acanonical H2AH2AC73013ENSG00000196866ENST00000341023ENSP00000341094NM_021065NP_066409protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV...13012408966 25731851
18H2Acanonical H2AH2AC83012ENSG00000277075ENST00000303910ENSP00000303373NM_021052NP_066390protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV...13012408966 25731851
19H2Acanonical H2AH2AC118969ENSG00000196787ENST00000359193ENSP00000352119NM_021064NP_066408protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
20H2Acanonical H2AH2AC1285235ENSG00000274997ENST00000377459ENSP00000366679NM_080596NP_542163protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...12812408966 25731851
21H2Acanonical H2AH2AC138329ENSG00000196747ENST00000358739ENSP00000351589NM_003509NP_003500protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
22H2Acanonical H2AH2AC148331ENSG00000276368ENST00000333151ENSP00000328484NM_021066NP_066544protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...12812408966 25731851
23H2Acanonical H2AH2AC158330ENSG00000275221ENST00000618958ENSP00000482431NM_003510NP_003501protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
24H2Acanonical H2AH2AC168332ENSG00000276903ENST00000613174ENSP00000482538NM_003511NP_003502protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
25H2Acanonical H2AH2AC178336ENSG00000278677ENST00000359611ENSP00000352627NM_003514NP_003505protein_codingb'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
26H2Acanonical H2AH2AC188337ENSG00000203812ENST00000369159ENSP00000358155NM_003516NP_003507protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
27H2Acanonical H2AH2AC19723790ENSG00000272196ENST00000607355ENSP00000475814NM_001040874NP_001035807protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
28H2Acanonical H2AH2AC208338ENSG00000184260ENST00000331380ENSP00000332194NM_003517NP_003508protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...12912408966 25731851
29H2Acanonical H2AH2AC21317772ENSG00000184270ENST00000331128ENSP00000332790NM_175065NP_778235protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...13012408966 25731851
30H2AH2A.J(?)H2AJ55766ENSG00000246705ENST00000544848ENSP00000438553NM_177925NP_808760protein_codingb'MSGRGKQGGKVRAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV...12925731851
33H2Acanonical H2AH2AW92815ENSG00000181218ENST00000366695ENSP00000355656NM_033445NP_254280protein_codingb'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYSERV...130?
..........................................
112H3H3.Y.1H3Y1391769ENSG00000269466ENST00000598383ENSP00000496014NM_001355258NP_001342187protein_codingb'MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKP...13620819935
113H3H3.Y.2H3Y2340096ENSG00000268799ENST00000600799ENSP00000497053NM_001371919NP_001358848protein_codingb'MARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKP...14720819935
115H3canonical H3(?)H3-2440686ENSG00000273213ENST00000609879ENSP00000499501NM_001355409NP_001342338protein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRP...13612408966
118H3H3.3H3-3A3020ENSG00000163041ENST00000366814ENSP00000355779NaNNaNprotein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP...12319412883
119H3H3.3H3-3A3020ENSG00000163041ENST00000366815ENSP00000355780NM_002107NP_002098protein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP...13619412883
121H3H3.3H3-3A3020ENSG00000163041ENST00000667897ENSP00000499446NaNNaNprotein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP...12019412883
125H3H3.3H3-3B3021ENSG00000132475ENST00000254810ENSP00000254810NM_005324NP_005315protein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP...13619412883
126H3H3.3H3-3B3021ENSG00000132475ENST00000592643ENSP00000467165NaNNaNprotein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP...11319412883
127H3H3.3H3-3B3021ENSG00000132475ENST00000591890ENSP00000466663NaNNaNprotein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP...9219412883
131H3H3.3H3-3B3021ENSG00000132475ENST00000586270ENSP00000465403NaNNaNprotein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP...13219412883
132H3H3.3H3-3B3021ENSG00000132475ENST00000587171ENSP00000468484NaNNaNprotein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP...15119412883
133H3TS H3.4H3-48290ENSG00000168148ENST00000366696ENSP00000355657NM_003493NP_003484protein_codingb'MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYRP...1368986613
134H3H3.5H3-5440093ENSG00000188375ENST00000340398ENSP00000339835NM_001013699NP_001013721protein_codingb'MARTKQTARKSTGGKAPRKQLATKAARKSTPSTCGVKPHRYRPG...13521274551
135H3cenH3CENPA1058ENSG00000115163ENST00000335756ENSP00000336868NM_001809NP_001800protein_codingb'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR...14023324462
136H3cenH3CENPA1058ENSG00000115163ENST00000233505ENSP00000233505NM_001042426NP_001035891protein_codingb'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR...11423324462
138H4canonical H4H4C18359ENSG00000278637ENST00000617569ENSP00000479106NM_003538NP_003529protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
139H4canonical H4H4C28366ENSG00000278705ENST00000377745ENSP00000366974NM_003544NP_003535protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
140H4canonical H4H4C38364ENSG00000197061ENST00000377803ENSP00000367034NM_003542NP_003533protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
141H4canonical H4H4C48360ENSG00000277157ENST00000614247ENSP00000479461NM_003539NP_003530protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
142H4canonical H4H4C58367ENSG00000276966ENST00000615164ENSP00000484789NM_003545NP_003536protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
143H4canonical H4H4C68361ENSG00000274618ENST00000244537ENSP00000244537NM_003540NP_003531protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
144H4canonical H4H4C78369ENSG00000275663ENST00000611444ENSP00000477870NM_003547NP_003538protein_codingb'MSVRGKAGKGLGKGGAKCHRKVLSDNIQGITKCTIRRLARHGGV...9812408966
147H4canonical H4H4C88365ENSG00000158406ENST00000377727ENSP00000366956NM_003543NP_003534protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
149H4canonical H4H4C98294ENSG00000276180ENST00000615353ENSP00000481486NM_003495NP_003486protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
150H4canonical H4H4C118363ENSG00000197238ENST00000355057ENSP00000347168NM_021968NP_068803protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
151H4canonical H4H4C128362ENSG00000273542ENST00000611927ENSP00000479794NM_003541NP_003532protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
152H4canonical H4H4C138368ENSG00000275126ENST00000618305ENSP00000480960NM_003546NP_003537protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
153H4canonical H4H4C148370ENSG00000270882ENST00000578186ENSP00000462667NM_003548NP_003539protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
159H4canonical H4H4C15554313ENSG00000270276ENST00000579512ENSP00000462355NM_001034077NP_001029249protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
161H4canonical H4H4-16121504ENSG00000197837ENST00000539745ENSP00000443017NM_175054NP_778224protein_codingb'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV...10312408966
\n", "

120 rows × 13 columns

\n", "
" ], "text/plain": [ " Histone type Histone variant HGNC symbol NCBI gene ID Ensembl gene ID \\\n", "0 H1 H1.0 H1-0 3005 ENSG00000189060 \n", "1 H1 H1.1 H1-1 3024 ENSG00000124610 \n", "2 H1 H1.2 H1-2 3006 ENSG00000187837 \n", "3 H1 H1.3 H1-3 3007 ENSG00000124575 \n", "4 H1 H1.4 H1-4 3008 ENSG00000168298 \n", "5 H1 H1.5 H1-5 3009 ENSG00000184357 \n", "6 H1 TS H1.6 H1-6 3010 ENSG00000187475 \n", "7 H1 TS H1.7 H1-7 341567 ENSG00000187166 \n", "8 H1 OO H1.8 H1-8 132243 ENSG00000178804 \n", "9 H1 OO H1.8 H1-8 132243 ENSG00000178804 \n", "10 H1 TS H1.9(?) H1-9P 373861 ENSG00000188662 \n", "11 H1 H1.10 H1-10 8971 ENSG00000184897 \n", "12 H2A TS H2A.1 H2AC1 221613 ENSG00000164508 \n", "13 H2A canonical H2A H2AC4 8335 ENSG00000278463 \n", "16 H2A canonical H2A H2AC6 8334 ENSG00000180573 \n", "17 H2A canonical H2A H2AC7 3013 ENSG00000196866 \n", "18 H2A canonical H2A H2AC8 3012 ENSG00000277075 \n", "19 H2A canonical H2A H2AC11 8969 ENSG00000196787 \n", "20 H2A canonical H2A H2AC12 85235 ENSG00000274997 \n", "21 H2A canonical H2A H2AC13 8329 ENSG00000196747 \n", "22 H2A canonical H2A H2AC14 8331 ENSG00000276368 \n", "23 H2A canonical H2A H2AC15 8330 ENSG00000275221 \n", "24 H2A canonical H2A H2AC16 8332 ENSG00000276903 \n", "25 H2A canonical H2A H2AC17 8336 ENSG00000278677 \n", "26 H2A canonical H2A H2AC18 8337 ENSG00000203812 \n", "27 H2A canonical H2A H2AC19 723790 ENSG00000272196 \n", "28 H2A canonical H2A H2AC20 8338 ENSG00000184260 \n", "29 H2A canonical H2A H2AC21 317772 ENSG00000184270 \n", "30 H2A H2A.J(?) H2AJ 55766 ENSG00000246705 \n", "33 H2A canonical H2A H2AW 92815 ENSG00000181218 \n", ".. ... ... ... ... ... \n", "112 H3 H3.Y.1 H3Y1 391769 ENSG00000269466 \n", "113 H3 H3.Y.2 H3Y2 340096 ENSG00000268799 \n", "115 H3 canonical H3(?) 
H3-2 440686 ENSG00000273213 \n", "118 H3 H3.3 H3-3A 3020 ENSG00000163041 \n", "119 H3 H3.3 H3-3A 3020 ENSG00000163041 \n", "121 H3 H3.3 H3-3A 3020 ENSG00000163041 \n", "125 H3 H3.3 H3-3B 3021 ENSG00000132475 \n", "126 H3 H3.3 H3-3B 3021 ENSG00000132475 \n", "127 H3 H3.3 H3-3B 3021 ENSG00000132475 \n", "131 H3 H3.3 H3-3B 3021 ENSG00000132475 \n", "132 H3 H3.3 H3-3B 3021 ENSG00000132475 \n", "133 H3 TS H3.4 H3-4 8290 ENSG00000168148 \n", "134 H3 H3.5 H3-5 440093 ENSG00000188375 \n", "135 H3 cenH3 CENPA 1058 ENSG00000115163 \n", "136 H3 cenH3 CENPA 1058 ENSG00000115163 \n", "138 H4 canonical H4 H4C1 8359 ENSG00000278637 \n", "139 H4 canonical H4 H4C2 8366 ENSG00000278705 \n", "140 H4 canonical H4 H4C3 8364 ENSG00000197061 \n", "141 H4 canonical H4 H4C4 8360 ENSG00000277157 \n", "142 H4 canonical H4 H4C5 8367 ENSG00000276966 \n", "143 H4 canonical H4 H4C6 8361 ENSG00000274618 \n", "144 H4 canonical H4 H4C7 8369 ENSG00000275663 \n", "147 H4 canonical H4 H4C8 8365 ENSG00000158406 \n", "149 H4 canonical H4 H4C9 8294 ENSG00000276180 \n", "150 H4 canonical H4 H4C11 8363 ENSG00000197238 \n", "151 H4 canonical H4 H4C12 8362 ENSG00000273542 \n", "152 H4 canonical H4 H4C13 8368 ENSG00000275126 \n", "153 H4 canonical H4 H4C14 8370 ENSG00000270882 \n", "159 H4 canonical H4 H4C15 554313 ENSG00000270276 \n", "161 H4 canonical H4 H4-16 121504 ENSG00000197837 \n", "\n", " Transcript stable ID Protein stable ID RefSeq mRNA ID RefSeq peptide ID \\\n", "0 ENST00000340857 ENSP00000344504 NM_005318 NP_005309 \n", "1 ENST00000244573 ENSP00000244573 NM_005325 NP_005316 \n", "2 ENST00000343677 ENSP00000339566 NM_005319 NP_005310 \n", "3 ENST00000244534 ENSP00000244534 NM_005320 NP_005311 \n", "4 ENST00000304218 ENSP00000307705 NM_005321 NP_005312 \n", "5 ENST00000331442 ENSP00000330074 NM_005322 NP_005313 \n", "6 ENST00000338379 ENSP00000341214 NM_005323 NP_005314 \n", "7 ENST00000335017 ENSP00000334805 NM_181788 NP_861453 \n", "8 ENST00000324382 ENSP00000319799 NM_153833 NP_722575 \n", "9 
ENST00000503977 ENSP00000422964 NM_001308262 NP_001295191 \n", "10 NaN NaN NaN NaN \n", "11 ENST00000333762 ENSP00000329662 NM_006026 NP_006017 \n", "12 ENST00000297012 ENSP00000297012 NM_170745 NP_734466 \n", "13 ENST00000615868 ENSP00000483842 NM_003513 NP_003504 \n", "16 ENST00000377791 ENSP00000367022 NM_003512 NP_003503 \n", "17 ENST00000341023 ENSP00000341094 NM_021065 NP_066409 \n", "18 ENST00000303910 ENSP00000303373 NM_021052 NP_066390 \n", "19 ENST00000359193 ENSP00000352119 NM_021064 NP_066408 \n", "20 ENST00000377459 ENSP00000366679 NM_080596 NP_542163 \n", "21 ENST00000358739 ENSP00000351589 NM_003509 NP_003500 \n", "22 ENST00000333151 ENSP00000328484 NM_021066 NP_066544 \n", "23 ENST00000618958 ENSP00000482431 NM_003510 NP_003501 \n", "24 ENST00000613174 ENSP00000482538 NM_003511 NP_003502 \n", "25 ENST00000359611 ENSP00000352627 NM_003514 NP_003505 \n", "26 ENST00000369159 ENSP00000358155 NM_003516 NP_003507 \n", "27 ENST00000607355 ENSP00000475814 NM_001040874 NP_001035807 \n", "28 ENST00000331380 ENSP00000332194 NM_003517 NP_003508 \n", "29 ENST00000331128 ENSP00000332790 NM_175065 NP_778235 \n", "30 ENST00000544848 ENSP00000438553 NM_177925 NP_808760 \n", "33 ENST00000366695 ENSP00000355656 NM_033445 NP_254280 \n", ".. ... ... ... ... 
\n", "112 ENST00000598383 ENSP00000496014 NM_001355258 NP_001342187 \n", "113 ENST00000600799 ENSP00000497053 NM_001371919 NP_001358848 \n", "115 ENST00000609879 ENSP00000499501 NM_001355409 NP_001342338 \n", "118 ENST00000366814 ENSP00000355779 NaN NaN \n", "119 ENST00000366815 ENSP00000355780 NM_002107 NP_002098 \n", "121 ENST00000667897 ENSP00000499446 NaN NaN \n", "125 ENST00000254810 ENSP00000254810 NM_005324 NP_005315 \n", "126 ENST00000592643 ENSP00000467165 NaN NaN \n", "127 ENST00000591890 ENSP00000466663 NaN NaN \n", "131 ENST00000586270 ENSP00000465403 NaN NaN \n", "132 ENST00000587171 ENSP00000468484 NaN NaN \n", "133 ENST00000366696 ENSP00000355657 NM_003493 NP_003484 \n", "134 ENST00000340398 ENSP00000339835 NM_001013699 NP_001013721 \n", "135 ENST00000335756 ENSP00000336868 NM_001809 NP_001800 \n", "136 ENST00000233505 ENSP00000233505 NM_001042426 NP_001035891 \n", "138 ENST00000617569 ENSP00000479106 NM_003538 NP_003529 \n", "139 ENST00000377745 ENSP00000366974 NM_003544 NP_003535 \n", "140 ENST00000377803 ENSP00000367034 NM_003542 NP_003533 \n", "141 ENST00000614247 ENSP00000479461 NM_003539 NP_003530 \n", "142 ENST00000615164 ENSP00000484789 NM_003545 NP_003536 \n", "143 ENST00000244537 ENSP00000244537 NM_003540 NP_003531 \n", "144 ENST00000611444 ENSP00000477870 NM_003547 NP_003538 \n", "147 ENST00000377727 ENSP00000366956 NM_003543 NP_003534 \n", "149 ENST00000615353 ENSP00000481486 NM_003495 NP_003486 \n", "150 ENST00000355057 ENSP00000347168 NM_021968 NP_068803 \n", "151 ENST00000611927 ENSP00000479794 NM_003541 NP_003532 \n", "152 ENST00000618305 ENSP00000480960 NM_003546 NP_003537 \n", "153 ENST00000578186 ENSP00000462667 NM_003548 NP_003539 \n", "159 ENST00000579512 ENSP00000462355 NM_001034077 NP_001029249 \n", "161 ENST00000539745 ENSP00000443017 NM_175054 NP_778224 \n", "\n", " Transcript type Protein sequence \\\n", "0 protein_coding b'MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAG... 
\n", "1 protein_coding b'MSETVPPAPAASAAPEKPLAGKKAKKPAKAAAASKKKPAGPSVS... \n", "2 protein_coding b'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELI... \n", "3 protein_coding b'MSETAPLAPTIPAPAEKTPVKKKAKKAGATAGKRKASGPPVSEL... \n", "4 protein_coding b'MSETAPAAPAAPAPAEKTPVKKKARKSAGAAKRKASGPPVSELI... \n", "5 protein_coding b'MSETAPAETATPAPVEKSPAKKKATKKAAGAGAAKRKATGPPVS... \n", "6 protein_coding b'MSETVPAASASAGVAAMEKLPTKKRGRKPAGLISASRKVPNLSV... \n", "7 protein_coding b'MEQALTGEAQSRWPRRGGSGAMAEAPGPSGESRGHSATQLPAEK... \n", "8 protein_coding b'MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHS... \n", "9 protein_coding b'MAPATAPRRAGEAKGKGPKKPSEAKEDPPNVGKVKKAAKRPAKV... \n", "10 NaN b'{\"error\":\"ID \\'nan\\' not found\"}' \n", "11 protein_coding b'MSVELEEALPVTTAEGMAKKVTKAGGSAALSPSKKRKNSKKKNQ... \n", "12 protein_coding b'MSGRGKQGGKARAKSKSRSSRAGLQFPVGRIHRLLRKGNYAERI... \n", "13 protein_coding b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... \n", "16 protein_coding b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "17 protein_coding b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... \n", "18 protein_coding b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... \n", "19 protein_coding b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "20 protein_coding b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "21 protein_coding b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "22 protein_coding b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "23 protein_coding b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "24 protein_coding b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "25 protein_coding b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "26 protein_coding b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "27 protein_coding b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "28 protein_coding b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "29 protein_coding b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... 
\n", "30 protein_coding b'MSGRGKQGGKVRAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... \n", "33 protein_coding b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYSERV... \n", ".. ... ... \n", "112 protein_coding b'MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKP... \n", "113 protein_coding b'MARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKP... \n", "115 protein_coding b'MARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRP... \n", "118 protein_coding b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... \n", "119 protein_coding b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... \n", "121 protein_coding b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... \n", "125 protein_coding b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... \n", "126 protein_coding b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... \n", "127 protein_coding b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... \n", "131 protein_coding b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... \n", "132 protein_coding b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... \n", "133 protein_coding b'MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYRP... \n", "134 protein_coding b'MARTKQTARKSTGGKAPRKQLATKAARKSTPSTCGVKPHRYRPG... \n", "135 protein_coding b'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR... \n", "136 protein_coding b'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR... \n", "138 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "139 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "140 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "141 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "142 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "143 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "144 protein_coding b'MSVRGKAGKGLGKGGAKCHRKVLSDNIQGITKCTIRRLARHGGV... \n", "147 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "149 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... 
\n", "150 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "151 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "152 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "153 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "159 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "161 protein_coding b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... \n", "\n", " Protein length References \n", "0 194 26689747 \n", "1 215 26689747 \n", "2 213 26689747 \n", "3 221 26689747 \n", "4 219 26689747 \n", "5 226 26689747 \n", "6 207 26689747 \n", "7 255 26689747 \n", "8 346 26689747 \n", "9 207 26689747 \n", "10 30 12920187 26689747 17852044 \n", "11 213 26689747 \n", "12 131 2011515 7068607 24506885 \n", "13 130 12408966 25731851 \n", "16 130 12408966 25731851 \n", "17 130 12408966 25731851 \n", "18 130 12408966 25731851 \n", "19 130 12408966 25731851 \n", "20 128 12408966 25731851 \n", "21 130 12408966 25731851 \n", "22 128 12408966 25731851 \n", "23 130 12408966 25731851 \n", "24 130 12408966 25731851 \n", "25 130 12408966 25731851 \n", "26 130 12408966 25731851 \n", "27 130 12408966 25731851 \n", "28 129 12408966 25731851 \n", "29 130 12408966 25731851 \n", "30 129 25731851 \n", "33 130 ? \n", ".. ... ... 
\n", "112 136 20819935 \n", "113 147 20819935 \n", "115 136 12408966 \n", "118 123 19412883 \n", "119 136 19412883 \n", "121 120 19412883 \n", "125 136 19412883 \n", "126 113 19412883 \n", "127 92 19412883 \n", "131 132 19412883 \n", "132 151 19412883 \n", "133 136 8986613 \n", "134 135 21274551 \n", "135 140 23324462 \n", "136 114 23324462 \n", "138 103 12408966 \n", "139 103 12408966 \n", "140 103 12408966 \n", "141 103 12408966 \n", "142 103 12408966 \n", "143 103 12408966 \n", "144 98 12408966 \n", "147 103 12408966 \n", "149 103 12408966 \n", "150 103 12408966 \n", "151 103 12408966 \n", "152 103 12408966 \n", "153 103 12408966 \n", "159 103 12408966 \n", "161 103 12408966 \n", "\n", "[120 rows x 13 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hp" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "hp.drop(columns=['Protein sequence','Transcript type','Protein stable ID']).to_csv('human_histone_proteins_autogenerated.csv',index=False)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "!cp human_histone_proteins_autogenerated.csv docs/human_histones.csv" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Already up to date.\r\n" ] } ], "source": [ "!git pull" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "On branch master\n", "Your branch is up to date with 'origin/master'.\n", "\n", "nothing to commit, working tree clean\n", "Everything up-to-date\n" ] } ], "source": [ "!gacp" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "dataset.list_attributes().to_csv('atr.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:genomics]", "language": "python", "name": "conda-env-genomics-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.6" } }, "nbformat": 4, "nbformat_minor": 2 }