{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**Notebook to add human histones to DataBase, used once. To delete before final merge**" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import requests\n", "\n", "from Bio import AlignIO\n", "from Bio import SeqIO\n", "from Bio.Align.Applications import MuscleCommandline\n", "\n", "from Bio.SeqRecord import SeqRecord\n", "from Bio.Seq import Seq\n", "\n", "import subprocess" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "path = os.getcwd()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "hist_prot = pd.read_csv(os.path.join(path, \"static/human_hist/info/\", \"human_hist_proteins.csv\"))\n", "hist_genes = pd.read_csv(os.path.join(path, \"static/human_hist/info/\", \"human_hist_genes.csv\"), usecols=['Histone type', 'Histone variant', 'HGNC Symbol'])\n", "\n", "hist_prot = pd.merge(hist_prot, hist_genes.drop_duplicates(), on='HGNC Symbol', how='left')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Add human histones to histone variant fasta" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "var_dict = {\n", "'H1.0':'H1.0',\n", "'H1.1':'H1.1',#new\n", "'H1.2':'H1.2',#new\n", "'H1.3':'H1.3',#new\n", "'H1.4':'H1.4',#new\n", "'H1.5':'H1.5',#new\n", "'TS H1.6':'TS_H1.6',\n", "'TS H1.7':'TS_H1.7',\n", "'OO H1.8':'OO_H1.8',\n", "'TS H1.9(?)':'TS_H1.9',\n", "'H1.10':'H1.10',\n", "'TS H2A.1':'H2A.1',\n", "'canonical H2A':'canonical_H2A',\n", "'H2A.J(?)':'H2A.J',#new\n", "'H2A.X':'H2A.X',\n", "'H2A.Z.1':'H2A.Z',\n", "'H2A.Z.2':'H2A.Z',\n", "'macroH2A.1':'macroH2A',\n", "'macroH2A.2':'macroH2A',\n", "'H2A.B.1':'H2A.B',\n", "'H2A.B.2':'H2A.B',\n", "'H2A.P':'H2A.P',\n", "'TS H2B.1':'H2B.1', \n", "'canonical H2B':'canonical_H2B',\n", "'canonical H2B(?)':'canonical_H2B',\n", "'H2B.S(?)':'H2B.S',#new\n", "'H2B.W':'H2B.W',\n", "# '?':'', #new,\n", "'canonical H3.1':'canonical_H3',\n", "'canonical H3.2':'canonical_H3',\n", "'H3.Y.1':'H3.Y',\n", "'H3.Y.2':'H3.Y',\n", "'canonical H3(?)':'canonical_H3',\n", "'H3.3':'H3.3',\n", "'TS H3.4':'TS_H3.4',\n", "'H3.5':'H3.5',\n", "'cenH3':'cenH3',\n", "'canonical H4':'canonical_H4',\n", "}\n", "\n", "var_dict_reverse = {v: k for k, v in var_dict.items()}" ] }, { "cell_type": "code", "execution_count": 324, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "512" ] }, "execution_count": 324, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.system('>static/browse/seeds/H1/H1.1.fasta')\n", "os.system('>static/browse/seeds/H1/H1.2.fasta')\n", "os.system('>static/browse/seeds/H1/H1.3.fasta')\n", "os.system('>static/browse/seeds/H1/H1.4.fasta')\n", "os.system('>static/browse/seeds/H1/H1.5.fasta')\n", "os.system('>static/browse/seeds/H2A/H2A.J.fasta')\n", "os.system('>static/browse/seeds/H2B/H2B.S.fasta')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "var_type_dict = {}\n", "\n", "for hist_type in hist_prot['Histone type'].unique():\n", " var_list = list(hist_prot['Histone variant'].loc[hist_prot['Histone type']==hist_type].unique())\n", " var_type_dict[hist_type]=var_list" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'H1': ['H1.0',\n", " 'H1.1',\n", " 'H1.2',\n", " 'H1.3',\n", " 'H1.4',\n", " 'H1.5',\n", " 'TS H1.6',\n", " 'TS H1.7',\n", " 'OO H1.8',\n", " 'TS H1.9(?)',\n", " 'H1.10'],\n", " 'H2A': ['TS H2A.1',\n", " 'canonical H2A',\n", " 'H2A.J(?)',\n", " 'H2A.X',\n", " 'H2A.Z.1',\n", " 'H2A.Z.2',\n", " 'macroH2A.1',\n", " 'macroH2A.2',\n", " 'H2A.B.1',\n", " 'H2A.B.2',\n", " 'H2A.P'],\n", " 'H2B': ['TS H2B.1',\n", " 'canonical H2B',\n", " 'canonical H2B(?)',\n", " 'H2B.S(?)',\n", " 'H2B.W',\n", " '?'],\n", " 'H3': ['canonical H3.1',\n", " 'canonical H3.2',\n", " 'H3.Y.1',\n", " 'H3.Y.2',\n", " 'canonical H3(?)',\n", " 'H3.3',\n", " 'TS H3.4',\n", " 'H3.5',\n", " 'cenH3'],\n", " 'H4': ['canonical H4']}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "var_type_dict" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "set()" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set(var_dict.keys()) - set(hist_prot['Histone variant'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'?'}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set(hist_prot['Histone variant']) - set(var_dict.keys())" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 451 ms, sys: 69.2 ms, total: 521 ms\n", "Wall time: 2min 43s\n" ] } ], "source": [ "%%time\n", "\n", "seqs=[]\n", "\n", "for i in hist_prot['Protein stable ID']:\n", " seq=requests.get('http://rest.ensembl.org/sequence/id/%s?content-type=text/plain'%i).content\n", " seqs.append(seq.decode(\"utf-8\"))\n", " \n", "hist_seq = dict(zip(hist_prot['RefSeq peptide ID'],seqs))\n", "hist_prot['seq']=seqs" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "120" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(hist_prot)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "105" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(hist_seq)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NP_001157888 1\n", "NP_003518 1\n", "NP_003537 1\n", "NP_734466 1\n", "NP_005311 1\n", " ..\n", "NP_066544 1\n", "NP_001013721 1\n", "NP_003514 1\n", "NP_958925 1\n", "NP_003534 1\n", "Name: RefSeq peptide ID, Length: 104, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hist_prot['RefSeq peptide ID'].value_counts()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "0\n", "1\n", "1\n" ] } ], "source": [ "for i in ['NP_004884', 'NP_542160','NP_001299582','NP_001035248']:\n", " print( hist_prot['seq'].loc[hist_prot['RefSeq peptide ID'] == i].nunique() )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "не буду добавлять GI в описание\n", "- https://ncbiinsights.ncbi.nlm.nih.gov/2016/12/06/converting-gi-numbers-to-accession-version/" ] }, { "cell_type": "code", "execution_count": 326, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "====================\n", "---------------------\n", "H1.0\n", "NP_005309 in file\n", "---------------------\n", "H1.1\n", "NP_005316 added to new file\n", "---------------------\n", "H1.2\n", "NP_005310 added to new file\n", "---------------------\n", "H1.3\n", "NP_005311 added to new file\n", "---------------------\n", "H1.4\n", "NP_005312 added to new file\n", "---------------------\n", "H1.5\n", "NP_005313 added to new file\n", "---------------------\n", "TS H1.6\n", "NP_005314 in file\n", "---------------------\n", "TS H1.7\n", "NP_861453 in file\n", "---------------------\n", "OO H1.8\n", "NP_722575 in file\n", "NP_001295191 not in file, added\n", "---------------------\n", "TS H1.9(?)\n", "---------------------\n", "H1.10\n", "NP_006017 in file\n", "====================\n", "---------------------\n", "TS H2A.1\n", "NP_734466 in file\n", "---------------------\n", "canonical H2A\n", "NP_003504 not in file, added\n", "NP_003503 not in file, added\n", "NP_066409 not in file, added\n", "NP_066390 in file\n", "NP_066408 not in file, added\n", "NP_542163 not in file, added\n", "NP_003500 not in file, added\n", "NP_066544 not in file, added\n", "NP_003501 not in file, added\n", "NP_003502 not in file, added\n", "NP_003505 not in file, added\n", "NP_003507 not in file, added\n", "NP_001035807 not in file, added\n", "NP_003508 not in file, added\n", "NP_778235 not in file, added\n", "NP_254280 not in file, added\n", "---------------------\n", "H2A.J(?)\n", "NP_808760 added to new file\n", "---------------------\n", "H2A.X\n", "NP_002096 in file\n", "---------------------\n", "H2A.Z.1\n", "NP_002097 in file\n", "---------------------\n", "H2A.Z.2\n", "NP_619541 not in file, added\n", "NP_958925 not in file, added\n", "NP_036544 in file\n", "NP_958844 not in file, added\n", "NP_958924 not in file, added\n", "---------------------\n", "macroH2A.1\n", "NP_613258 in file\n", "NP_001035248 not in file, added\n", "NP_613075 not in file, added\n", "---------------------\n", "macroH2A.2\n", "NP_061119 in file\n", "---------------------\n", "H2A.B.1\n", "NP_001017990 in file\n", "---------------------\n", "H2A.B.2\n", "NP_001017991 not in file, added\n", "NP_542451 in file\n", "---------------------\n", "H2A.P\n", "NP_036406 in file\n", "====================\n", "---------------------\n", "TS H2B.1\n", "NP_733759 in file\n", "---------------------\n", "canonical H2B\n", "NP_066406 not in file, added\n", "NP_003517 not in file, added\n", "NP_066407 not in file, added\n", "NP_003514 not in file, added\n", "NP_003513 not in file, added\n", "NP_003516 not in file, added\n", "NP_003515 not in file, added\n", "NP_003509 not in file, added\n", "NP_066402 in file\n", "NP_001299582 not in file, added\n", "NP_003510 not in file, added\n", "NP_003512 not in file, added\n", "NP_003511 not in file, added\n", "NP_003518 not in file, added\n", "NP_001019770 not in file, added\n", "NP_001154806 not in file, added\n", "NP_778225 not in file, added\n", "---------------------\n", "canonical H2B(?)\n", "NP_003519 not in file, added\n", "---------------------\n", "H2B.S(?)\n", "NP_059141 added to new file\n", "---------------------\n", "H2B.W\n", "NP_001002916 not in file, added\n", "====================\n", "---------------------\n", "canonical H3.1\n", "NP_003520 in file\n", "NP_003528 not in file, added\n", "NP_003522 not in file, added\n", "NP_003521 not in file, added\n", "NP_003523 not in file, added\n", "NP_066298 not in file, added\n", "NP_003525 not in file, added\n", "NP_003527 not in file, added\n", "NP_003524 not in file, added\n", "NP_003526 not in file, added\n", "---------------------\n", "canonical H3.2\n", "NP_001116847 not in file, added\n", "NP_066403 not in file, added\n", "NP_001005464 not in file, added\n", "---------------------\n", "H3.Y.1\n", "NP_001342187 not in file, added\n", "---------------------\n", "H3.Y.2\n", "NP_001358848 not in file, added\n", "---------------------\n", "canonical H3(?)\n", "NP_001342338 not in file, added\n", "---------------------\n", "H3.3\n", "NP_002098 not in file, added\n", "NP_005315 not in file, added\n", "---------------------\n", "TS H3.4\n", "NP_003484 in file\n", "---------------------\n", "H3.5\n", "NP_001013721 in file\n", "---------------------\n", "cenH3\n", "NP_001800 in file\n", "NP_001035891 not in file, added\n", "====================\n", "---------------------\n", "canonical H4\n", "NP_003529 not in file, added\n", "NP_003535 not in file, added\n", "NP_003533 not in file, added\n", "NP_003530 not in file, added\n", "NP_003536 not in file, added\n", "NP_003531 not in file, added\n", "NP_003538 in file\n", "NP_003534 not in file, added\n", "NP_003486 not in file, added\n", "NP_068803 not in file, added\n", "NP_003532 not in file, added\n", "NP_003537 not in file, added\n", "NP_003539 not in file, added\n", "NP_001029249 in file\n", "NP_778224 not in file, added\n" ] } ], "source": [ "#for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:\n", "\n", "added_id = []\n", "\n", "for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:\n", "\n", " print('====================')\n", " for hist_var in var_type_dict[hist_type]: \n", " \n", " if hist_var!='?':\n", " \n", " print('---------------------')\n", " print(hist_var)\n", "\n", " hist_var_histdb = var_dict[hist_var] \n", "\n", " try:\n", " alignment = AlignIO.read(os.path.join(path,\"static/browse/seeds/{}/\".format(hist_type),\"{}.fasta\".format(hist_var_histdb)), \"fasta\")\n", "\n", " fasta_variant = []\n", "\n", " for record in alignment:\n", " fasta_variant.append(record.id.split('|')[1].split('.')[0])\n", "\n", " var_list = hist_prot['RefSeq peptide ID'].loc[hist_prot['Histone variant']=='{}'.format(hist_var)].values\n", "\n", "\n", " for refseq in var_list:\n", " if refseq in fasta_variant:\n", " print(f'{refseq} in file')\n", " else:\n", " if not pd.isna(refseq):\n", " print(f'{refseq} not in file, added') \n", " alignment.add_sequence(f'Homo|{refseq}|{hist_var_histdb}', hist_seq[refseq])\n", " added_id.append(refseq)\n", "\n", " SeqIO.write(alignment, os.path.join(path,\"static/browse/seeds/{}/\".format(hist_type),\"{}.fasta\".format(hist_var_histdb)), \"fasta\")\n", "\n", "\n", "\n", " except:\n", " var_list = hist_prot['RefSeq peptide ID'].loc[hist_prot['Histone variant']=='{}'.format(hist_var)].values\n", " records = []\n", " for refseq in var_list:\n", " if not pd.isna(refseq):\n", " print(f'{refseq} added to new file') \n", " record = SeqRecord(Seq(hist_seq[refseq]),\n", " id=f'Homo|{refseq}|{hist_var_histdb}', name=f'Homo|{refseq}|{hist_var_histdb}',\n", " description=f'Homo|{refseq}|{hist_var_histdb}')\n", " records.append(record)\n", " added_id.append(refseq)\n", " SeqIO.write(records, os.path.join(path,\"static/browse/seeds/{}/\".format(hist_type),\"{}.fasta\".format(hist_var_histdb)), \"fasta\")\n", "\n", "\n", " os.chdir(f'{path}/static/browse/seeds/{hist_type}/')\n", " os.system(f'muscle -in {hist_var_histdb}.fasta -out {hist_var_histdb}.fasta')" ] }, { "cell_type": "code", "execution_count": 329, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "79" ] }, "execution_count": 329, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(added_id)" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Make Histone type fasta" ] }, { "cell_type": "code", "execution_count": 330, "metadata": {}, "outputs": [], "source": [ "os.chdir(f'{path}/static/browse/seeds')" ] }, { "cell_type": "code", "execution_count": 331, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "H1\n", "H2A\n", "H2B\n", "H3\n", "H4\n" ] } ], "source": [ "%%bash\n", "\n", "\n", "for var in H1 H2A H2B H3 H4\n", "do\n", " echo $var\n", " for file in `ls ${var}/*.fasta`\n", " do\n", " cat $file >> ${var}_temp.fasta\n", " done\n", " \n", "done;\n" ] }, { "cell_type": "code", "execution_count": 332, "metadata": {}, "outputs": [], "source": [ "for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:\n", " with open(f\"{hist_type}_ungap.fasta\", \"w\") as o:\n", " for record in SeqIO.parse(f'{hist_type}_temp.fasta', 'fasta'):\n", " record.seq = record.seq.ungap(\"-\")\n", " SeqIO.write(record, o, \"fasta\")\n", " output = subprocess.run([\"muscle\", \"-in\", f\"{hist_type}_ungap.fasta\", \"-out\", f\"{hist_type}_with_human.fasta\"], capture_output=True)\n", " #subprocess.run([\"rm\", f\"{hist_type}_temp.fasta\"])\n", " \n" ] }, { "cell_type": "code", "execution_count": 333, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "\n", "for hist_type in H1 H2A H2B H3 H4\n", "do\n", " rm ${hist_type}_temp.fasta\n", " rm ${hist_type}_ungap.fasta\n", "done" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 303, "metadata": {}, "outputs": [], "source": [ "temp = pd.read_csv(os.path.join(path, \"static/human_hist/info/\", \"human_hist_genes.csv\"))" ] }, { "cell_type": "code", "execution_count": 310, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Histone typeHistone variantHGNC SymbolNCBI gene IDEnsembl gene IDExpr. timingExpr. patternBiotypeBona fide canonicalPMIDs
1H1H1.1H1-13024ENSG00000124610RDNaNCODNaN26689747
2H1H1.2H1-23006ENSG00000187837MixedNaNCODNaN26689747
3H1H1.3H1-33007ENSG00000124575RDNaNCODNaN26689747
4H1H1.4H1-43008ENSG00000168298RDNaNCODNaN26689747
5H1H1.5H1-53009ENSG00000184357RDNaNCODNaN26689747
33H2AH2A.J(?)H2AJ55766ENSG00000246705RINaNCODNaN25731851
77H2BH2B.S(?)H2BS154145ENSG00000234289RINaNCODNaN?
\n", "
" ], "text/plain": [ " Histone type Histone variant HGNC Symbol NCBI gene ID Ensembl gene ID \\\n", "1 H1 H1.1 H1-1 3024 ENSG00000124610 \n", "2 H1 H1.2 H1-2 3006 ENSG00000187837 \n", "3 H1 H1.3 H1-3 3007 ENSG00000124575 \n", "4 H1 H1.4 H1-4 3008 ENSG00000168298 \n", "5 H1 H1.5 H1-5 3009 ENSG00000184357 \n", "33 H2A H2A.J(?) H2AJ 55766 ENSG00000246705 \n", "77 H2B H2B.S(?) H2BS1 54145 ENSG00000234289 \n", "\n", " Expr. timing Expr. pattern Biotype Bona fide canonical PMIDs \n", "1 RD NaN COD NaN 26689747 \n", "2 Mixed NaN COD NaN 26689747 \n", "3 RD NaN COD NaN 26689747 \n", "4 RD NaN COD NaN 26689747 \n", "5 RD NaN COD NaN 26689747 \n", "33 RI NaN COD NaN 25731851 \n", "77 RI NaN COD NaN ? " ] }, "execution_count": 310, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp.loc[temp['Histone variant'].isin(['H1.1', 'H1.2', 'H1.3','H1.4', 'H1.5', 'H2A.J(?)', 'H2B.S(?)'])]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:.conda-chrombioinf]", "language": "python", "name": "conda-env-.conda-chrombioinf-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" } }, "nbformat": 4, "nbformat_minor": 4 }