{ "metadata": { "name": "", "signature": "sha256:38eb30f6cf65431aa8bf63fbcb27ecbd45f56796fe805cc2a9594ae6b75dad31" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Identifying mCpG binding proteins in Crassostrea gigas proteome" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Approach 1: NCBI BioSystems & BLAST" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"methyl-CpG_binding_-_BioSystems_-_NCBI_and_sr320_and_invert_review_17BE6662.png\"/" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!head /Volumes/web/cnidarian/BioSys_mCpGbind_492643.fasta" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ">gi|528955085|ref|XP_005208771.1| PREDICTED: DNA (cytosine-5)-methyltransferase 1 isoform X5 [Bos taurus]\r\n", "MAEKGKPPKPVSRLYTPRRSKSDGETKSEVSSSPRITRKTTRQTTITSHFPRGPAKRKPEEEPEKVKSDD\r\n", "SVDEEKDQEEKRRRVTSRERVAGLLPAEEPGRVRPGTHMEEEGRDDKEEKRLRSQTKEPTPKHKAKEEPD\r\n", "RDVRPGGAQAEMNEGEDKDEKRHRSQPKDLASKRRPEEKEPERVKPQVSDEKDEDEKFWRIQFTYQSTSR\r\n", "EEKRRRTTYRELTEKKMTRTKIAVVSKTNPPKCTECLQYLDDPELRYEQHPPDAVEEIQILTNERLSIFD\r\n", "ANESGFESYEDLPQHKLTCFSVYCKRGHLCPIDTGLIEKDVELLFSGSAKPIYEDDPSPEGGINGKNFGP\r\n", "INEWWIAGFDGGEKALLGFSTSFAEYILMDPSPEYAPLFSVMQEKIYISKIVVEFLQSNPDSTYEDLINK\r\n", "IETTVPPCMLNLNRFTEDSLLRHAQFVVEQVESYDRAGDSDEQPIFLSPCMRDLIKLAGVTLGKRRAERR\r\n", "QTIRQPAKEKDKGPTKATTTKLVYQIFDTFFAEQIEKDDKEDKENAFKRRRCGVCEICQQPECGKCKACK\r\n", "DMVKFGGSGRSKQACQKRRCPNMAMKEADDDEEVDDNIPEMPSPKKMHQGKKKKQNKNRISWVGDAVKTD\r\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "!wc /Volumes/web/cnidarian/BioSys_mCpGbind_492643.fasta" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 4947 7661 316301 /Volumes/web/cnidarian/BioSys_mCpGbind_492643.fasta\r\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", 
"collapsed": false, "input": [ "!makeblastdb -in /Volumes/Bay3/Software/ncbi-blast-2.2.28\\+/db/oyster_v9p.fa -out /Volumes/Bay3/Software/ncbi-blast-2.2.28\\+/db/oyster_v9p -dbtype prot" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "\r\n", "Building a new DB, current time: 08/16/2013 07:28:01\r\n", "New DB name: /Volumes/Bay3/Software/ncbi-blast-2.2.28+/db/oyster_v9p\r\n", "New DB title: /Volumes/Bay3/Software/ncbi-blast-2.2.28+/db/oyster_v9p.fa\r\n", "Sequence type: Protein\r\n", "Keep Linkouts: T\r\n", "Keep MBits: T\r\n", "Maximum file size: 1000000000B\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Adding sequences from FASTA; added 28027 sequences in 1.79495 seconds.\r\n" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "!blastp -query /Volumes/web/cnidarian/BioSys_mCpGbind_492643.fasta -db /Volumes/Bay3/Software/ncbi-blast-2.2.28\\+/db/oyster_v9p -out /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out.txt -outfmt 6 -max_target_seqs 10" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "!head /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "gi|528955085|ref|XP_005208771.1|\tCGI_10021920\t61.43\t1006\t353\t15\t294\t1287\t1\t983\t0.0\t1202\r\n", "gi|528955085|ref|XP_005208771.1|\tCGI_10004707\t45.13\t113\t59\t2\t689\t798\t11\t123\t2e-22\t 103\r\n", "gi|528955085|ref|XP_005208771.1|\tCGI_10025673\t48.98\t49\t20\t1\t542\t585\t703\t751\t2e-07\t55.8\r\n", "gi|528955085|ref|XP_005208771.1|\tCGI_10020946\t25.71\t210\t120\t9\t1028\t1227\t1\t184\t7e-07\t53.1\r\n", "gi|528955085|ref|XP_005208771.1|\tCGI_10003574\t41.67\t60\t32\t2\t527\t585\t110\t167\t5e-06\t50.8\r\n", 
"gi|528955085|ref|XP_005208771.1|\tCGI_10021919\t46.88\t32\t17\t0\t234\t265\t239\t270\t0.96\t33.5\r\n", "gi|528955085|ref|XP_005208771.1|\tCGI_10013639\t25.00\t132\t92\t2\t5\t136\t1444\t1568\t1.3\t33.5\r\n", "gi|528955085|ref|XP_005208771.1|\tCGI_10002526\t22.58\t279\t186\t6\t1\t261\t153\t419\t5.8\t31.2\r\n", "gi|528955083|ref|XP_005208770.1|\tCGI_10021920\t61.43\t1006\t353\t15\t408\t1401\t1\t983\t0.0\t1202\r\n", "gi|528955083|ref|XP_005208770.1|\tCGI_10004707\t45.13\t113\t59\t2\t803\t912\t11\t123\t3e-22\t 103\r\n" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "!wc /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 4694 56328 389245 /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out.txt\r\n" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "!blastp -query /Volumes/web/cnidarian/BioSys_mCpGbind_492643.fasta -db /Volumes/Bay3/Software/ncbi-blast-2.2.28\\+/db/oyster_v9p -out /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out2.txt -evalue 1E-20 -outfmt 6 -max_target_seqs 10" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "!wc /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out2.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 365 4380 30402 /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out2.txt\r\n" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "#13 oyster sequences identified with 1E-20 cutoff\n", "!head /Volumes/web/cnidarian/mCpG_BP_candidates.fa" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ">CGI_10021920\r\n", "QHKITNFSVYDKNTHLCPFDTGLIEKNVFLYFSGVVKPIYDENSSPEGGI\r\n", "RACKMGPINEWWTAGFDGGENALIGFSTAYAEYILMSPSEAYKPYMDTMR\r\n", 
"EKIHMSKVVIEFMQNNQEATYEDLLNKIQTTVPPTGLSSLTEDSLLRHAQ\r\n", "FVLDQVQSYDEAAEEDEGLLITTPCMRALIKLAGVTLGKRRQMRKELRKT\r\n", "KDKVKKPAFTMATTTRLVTQIFDSLFQGEIDDKSGQGSKRRRCGICEICQ\r\n", "QPDCGKCTACKDMVKFGGSGKAKQACINRRCPNMAMKEADEDDILDDDDT\r\n", "DEKLETTKLSWVGDPVLQDGKNSYYSAVLINDEKVSFGDFISIKPEDVAI\r\n", "PVYIAMVNYLWENASGNKMCHVQWLCRGSDTILGETGDPLELFFVDDCES\r\n", "IKLESSLRKVKVLHKETSPDWFMQGGIEHPEKDFPIEDDSNTFYYQKWYD\r\n" ] } ], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "#Batch Web CD search\n", "!head /Volumes/web/cnidarian/mCpG_BP_candidates_CDhitdata.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "#Batch CD-search tool\tNIH/NLM/NCBI\r\n", "#cdsid\tQM3-qcdsearch-1A5793D4DABAFA61-10538AEF94374D59\r\n", "#datatype\thits Concise data\r\n", "#status\t0\r\n", "#Start time\t2013-08-16T14:49:55\tRun time\t0:00:00:20\r\n", "#status\tsuccess\r\n", "\r\n", "Query\tHit type\tPSSM-ID\tFrom\tTo\tE-Value\tBitscore\tAccession\tShort name\tIncomplete\tSuperfamily\r\n", "Q#1 - >CGI_10021920\tspecific\t240107\t332\t457\t2.95408e-47\t165.713\tcd04760\tBAH_Dnmt1_I\t - \tcl02608\r\n", "Q#1 - >CGI_10021920\tsuperfamily\t243106\t332\t457\t2.95408e-47\t165.713\tcl02608\tBAH superfamily\t - \t - \r\n" ] } ], "prompt_number": 13 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Two hard matches:\n", "\n", "- CGI_10023379: O95243, 3e-66, Methyl-CpG-binding domain protein 4\n", "- CGI_10011651: Q9UBB5, 4e-77, Methyl-CpG-binding domain protein 2\n" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Approach 2: NCBI Batch CD search" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"Welcome_to_NCBI_Batch_CD-search_17BE6F70.png\"/" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "All 28k proteins were submitted to the Batch CD-search" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "\"Welcome_to_NCBI_Batch_CD-search_17BF16AE.png\"/" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!head /Volumes/web/cnidarian/TJGR_CCD_feature.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "#Batch CD-search tool\tNIH/NLM/NCBI\r\n", "#cdsid\tQM3-qcdsearch-1DCFE6033927026F-7DD9BF65B71B020B\r\n", "#datatype\tfeats\r\n", "#status\t0\r\n", "#Start time\t2013-08-16T14:31:22\tRun time\t0:11:44:47\r\n", "#status\tsuccess\r\n", "\r\n", "Query\tType\tTitle\tcoordinates\tcomplete size\tmapped size\tsource domain\r\n", "Q#2 - >CGI_10000456\tspecific\tactive site residues\tK116,K123\t2\t2\t238385\r\n", "Q#2 - >CGI_10000456\tspecific\tMoaE homodimer interface\tS17,T21,G26,I28,S29,I30,F31,V32,I34,R36,A90,P99,R101,E109,D113\t15\t15\t238385\r\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "!head /Volumes/web/cnidarian/TJGR_CCD_dom_con_S_define.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "#Batch CD-search tool\tNIH/NLM/NCBI\r\n", "#cdsid\tQM3-qcdsearch-1DCFE6033927026F-5D508E148AAAD632\r\n", "#datatype\thits Concise data(Superfamily only)\r\n", "#status\t0\r\n", "#Start time\t2013-08-16T14:31:22\tRun time\t0:11:44:47\r\n", "#status\tsuccess\r\n", "\r\n", "Query\tHit type\tPSSM-ID\tFrom\tTo\tE-Value\tBitscore\tAccession\tShort name\tIncomplete\tSuperfamily\tDefinition\r\n", "Q#2 - >CGI_10000456\tsuperfamily\t241841\t11\t135\t8.33438e-46\t147.279\tcl00399\tMoaE superfamily\t - \t - \tMoaE family. Members of this family are involved in biosynthesis of the molybdenum cofactor (Moco), an essential cofactor for a diverse group of redox enzymes. Moco biosynthesis is an evolutionarily conserved pathway present in eubacteria, archaea and eukaryotes. 
Moco contains a tricyclic pyranopterin, termed molybdopterin (MPT), which carries the cis-dithiolene group responsible for molybdenum ligation. This dithiolene group is generated by MPT synthase in the second major step in Moco biosynthesis. MPT synthase is a heterotetramer consisting of two large (MoaE) and two small (MoaD) subunits.\r\n", "Q#4 - >CGI_10000774\tsuperfamily\t220249\t54\t121\t1.85274e-18\t74.564\tcl09695\tH_lectin superfamily\t - \t - \tH-type lectin domain; The H-type lectin domain is a unit of six beta chains, combined into a homo-hexamer. It is involved in self/non-self recognition of cells, through binding with carbohydrates. It is sometimes found in association with the F5_F8_type_C domain pfam00754.\r\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "!head /Volumes/web/cnidarian/TJGR_CCD_dom_con.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "#Batch CD-search tool\tNIH/NLM/NCBI\r\n", "#cdsid\tQM3-qcdsearch-1DCFE6033927026F-517F715027090CD6\r\n", "#datatype\thits Concise data\r\n", "#status\t0\r\n", "#Start time\t2013-08-16T14:31:22\tRun time\t0:11:44:47\r\n", "#status\tsuccess\r\n", "\r\n", "Query\tHit type\tPSSM-ID\tFrom\tTo\tE-Value\tBitscore\tAccession\tShort name\tIncomplete\tSuperfamily\r\n", "Q#2 - >CGI_10000456\tspecific\t238385\t11\t135\t8.33438e-46\t147.279\tcd00756\tMoaE\t - \tcl00399\r\n", "Q#2 - >CGI_10000456\tsuperfamily\t241841\t11\t135\t8.33438e-46\t147.279\tcl00399\tMoaE superfamily\t - \t - \r\n" ] } ], "prompt_number": 3 }, { "cell_type": "markdown", "metadata": {}, "source": [ "File with definition uploaded to SQLShare \n", "\"SQLShare_-_View_Query_17BFC14D.png\"/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Query to find \"CpG\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "SELECT * \n", " FROM 
[sr320@washington.edu].[TJGR_CCD_domain_concise_Superfamily_def]\n", " where \n", " Definition like '%CpG%'\n", "``` " ] }, { "cell_type": "code", "collapsed": false, "input": [ "!wc /Volumes/web/cnidarian/TJGR_CpG_domain.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 18 1699 13383 /Volumes/web/cnidarian/TJGR_CpG_domain.csv\r\n" ] } ], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"Screenshot_8_17_13_8_40_AM_17BFD0E5.png\"/" ] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }