{
"metadata": {
"name": "",
"signature": "sha256:38eb30f6cf65431aa8bf63fbcb27ecbd45f56796fe805cc2a9594ae6b75dad31"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Identifying mCpG binding proteins in Crassostrea gigas proteome"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Approach 1: NCBI Biosytems & Blast"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head /Volumes/web/cnidarian/BioSys_mCpGbind_492643.fasta"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
">gi|528955085|ref|XP_005208771.1| PREDICTED: DNA (cytosine-5)-methyltransferase 1 isoform X5 [Bos taurus]\r\n",
"MAEKGKPPKPVSRLYTPRRSKSDGETKSEVSSSPRITRKTTRQTTITSHFPRGPAKRKPEEEPEKVKSDD\r\n",
"SVDEEKDQEEKRRRVTSRERVAGLLPAEEPGRVRPGTHMEEEGRDDKEEKRLRSQTKEPTPKHKAKEEPD\r\n",
"RDVRPGGAQAEMNEGEDKDEKRHRSQPKDLASKRRPEEKEPERVKPQVSDEKDEDEKFWRIQFTYQSTSR\r\n",
"EEKRRRTTYRELTEKKMTRTKIAVVSKTNPPKCTECLQYLDDPELRYEQHPPDAVEEIQILTNERLSIFD\r\n",
"ANESGFESYEDLPQHKLTCFSVYCKRGHLCPIDTGLIEKDVELLFSGSAKPIYEDDPSPEGGINGKNFGP\r\n",
"INEWWIAGFDGGEKALLGFSTSFAEYILMDPSPEYAPLFSVMQEKIYISKIVVEFLQSNPDSTYEDLINK\r\n",
"IETTVPPCMLNLNRFTEDSLLRHAQFVVEQVESYDRAGDSDEQPIFLSPCMRDLIKLAGVTLGKRRAERR\r\n",
"QTIRQPAKEKDKGPTKATTTKLVYQIFDTFFAEQIEKDDKEDKENAFKRRRCGVCEICQQPECGKCKACK\r\n",
"DMVKFGGSGRSKQACQKRRCPNMAMKEADDDEEVDDNIPEMPSPKKMHQGKKKKQNKNRISWVGDAVKTD\r\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!wc /Volumes/web/cnidarian/BioSys_mCpGbind_492643.fasta"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 4947 7661 316301 /Volumes/web/cnidarian/BioSys_mCpGbind_492643.fasta\r\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!makeblastdb -in /Volumes/Bay3/Software/ncbi-blast-2.2.28\\+/db/oyster_v9p.fa -out /Volumes/Bay3/Software/ncbi-blast-2.2.28\\+/db/oyster_v9p -dbtype prot"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"\r\n",
"Building a new DB, current time: 08/16/2013 07:28:01\r\n",
"New DB name: /Volumes/Bay3/Software/ncbi-blast-2.2.28+/db/oyster_v9p\r\n",
"New DB title: /Volumes/Bay3/Software/ncbi-blast-2.2.28+/db/oyster_v9p.fa\r\n",
"Sequence type: Protein\r\n",
"Keep Linkouts: T\r\n",
"Keep MBits: T\r\n",
"Maximum file size: 1000000000B\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Adding sequences from FASTA; added 28027 sequences in 1.79495 seconds.\r\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!blastp -query /Volumes/web/cnidarian/BioSys_mCpGbind_492643.fasta -db /Volumes/Bay3/Software/ncbi-blast-2.2.28\\+/db/oyster_v9p -out /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out.txt -outfmt 6 -max_target_seqs 10"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out.txt"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"gi|528955085|ref|XP_005208771.1|\tCGI_10021920\t61.43\t1006\t353\t15\t294\t1287\t1\t983\t0.0\t1202\r\n",
"gi|528955085|ref|XP_005208771.1|\tCGI_10004707\t45.13\t113\t59\t2\t689\t798\t11\t123\t2e-22\t 103\r\n",
"gi|528955085|ref|XP_005208771.1|\tCGI_10025673\t48.98\t49\t20\t1\t542\t585\t703\t751\t2e-07\t55.8\r\n",
"gi|528955085|ref|XP_005208771.1|\tCGI_10020946\t25.71\t210\t120\t9\t1028\t1227\t1\t184\t7e-07\t53.1\r\n",
"gi|528955085|ref|XP_005208771.1|\tCGI_10003574\t41.67\t60\t32\t2\t527\t585\t110\t167\t5e-06\t50.8\r\n",
"gi|528955085|ref|XP_005208771.1|\tCGI_10021919\t46.88\t32\t17\t0\t234\t265\t239\t270\t0.96\t33.5\r\n",
"gi|528955085|ref|XP_005208771.1|\tCGI_10013639\t25.00\t132\t92\t2\t5\t136\t1444\t1568\t1.3\t33.5\r\n",
"gi|528955085|ref|XP_005208771.1|\tCGI_10002526\t22.58\t279\t186\t6\t1\t261\t153\t419\t5.8\t31.2\r\n",
"gi|528955083|ref|XP_005208770.1|\tCGI_10021920\t61.43\t1006\t353\t15\t408\t1401\t1\t983\t0.0\t1202\r\n",
"gi|528955083|ref|XP_005208770.1|\tCGI_10004707\t45.13\t113\t59\t2\t803\t912\t11\t123\t3e-22\t 103\r\n"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!wc /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out.txt"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 4694 56328 389245 /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out.txt\r\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!blastp -query /Volumes/web/cnidarian/BioSys_mCpGbind_492643.fasta -db /Volumes/Bay3/Software/ncbi-blast-2.2.28\\+/db/oyster_v9p -out /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out2.txt -evalue 1E-20 -outfmt 6 -max_target_seqs 10"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!wc /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out2.txt"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 365 4380 30402 /Volumes/web/cnidarian/mCpG_BP_blastp_v9_out2.txt\r\n"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#13 oyster sequences identified with 1E-20 cutoff\n",
"!head /Volumes/web/cnidarian/mCpG_BP_candidates.fa"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
">CGI_10021920\r\n",
"QHKITNFSVYDKNTHLCPFDTGLIEKNVFLYFSGVVKPIYDENSSPEGGI\r\n",
"RACKMGPINEWWTAGFDGGENALIGFSTAYAEYILMSPSEAYKPYMDTMR\r\n",
"EKIHMSKVVIEFMQNNQEATYEDLLNKIQTTVPPTGLSSLTEDSLLRHAQ\r\n",
"FVLDQVQSYDEAAEEDEGLLITTPCMRALIKLAGVTLGKRRQMRKELRKT\r\n",
"KDKVKKPAFTMATTTRLVTQIFDSLFQGEIDDKSGQGSKRRRCGICEICQ\r\n",
"QPDCGKCTACKDMVKFGGSGKAKQACINRRCPNMAMKEADEDDILDDDDT\r\n",
"DEKLETTKLSWVGDPVLQDGKNSYYSAVLINDEKVSFGDFISIKPEDVAI\r\n",
"PVYIAMVNYLWENASGNKMCHVQWLCRGSDTILGETGDPLELFFVDDCES\r\n",
"IKLESSLRKVKVLHKETSPDWFMQGGIEHPEKDFPIEDDSNTFYYQKWYD\r\n"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Batch Web CD search\n",
"!head /Volumes/web/cnidarian/mCpG_BP_candidates_CDhitdata.txt"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"#Batch CD-search tool\tNIH/NLM/NCBI\r\n",
"#cdsid\tQM3-qcdsearch-1A5793D4DABAFA61-10538AEF94374D59\r\n",
"#datatype\thits Concise data\r\n",
"#status\t0\r\n",
"#Start time\t2013-08-16T14:49:55\tRun time\t0:00:00:20\r\n",
"#status\tsuccess\r\n",
"\r\n",
"Query\tHit type\tPSSM-ID\tFrom\tTo\tE-Value\tBitscore\tAccession\tShort name\tIncomplete\tSuperfamily\r\n",
"Q#1 - >CGI_10021920\tspecific\t240107\t332\t457\t2.95408e-47\t165.713\tcd04760\tBAH_Dnmt1_I\t - \tcl02608\r\n",
"Q#1 - >CGI_10021920\tsuperfamily\t243106\t332\t457\t2.95408e-47\t165.713\tcl02608\tBAH superfamily\t - \t - \r\n"
]
}
],
"prompt_number": 13
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two hard matches \n",
"CGI_10023379\n",
"CGI_10011651\n",
"\n",
"CGI_10023379\n",
"O95243\n",
"3e-66\n",
"Methyl-CpG-binding domain protein 4\n",
"\n",
"CGI_10011651\n",
"Q9UBB5\n",
"4e-77\n",
"Methyl-CpG-binding domain protein 2\n",
"\n",
"\n"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"\n",
"Approach 2: NCBI Batch CD search"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"All 28k Proteins were subjected to search "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head /Volumes/web/cnidarian/TJGR_CCD_feature.txt"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"#Batch CD-search tool\tNIH/NLM/NCBI\r\n",
"#cdsid\tQM3-qcdsearch-1DCFE6033927026F-7DD9BF65B71B020B\r\n",
"#datatype\tfeats\r\n",
"#status\t0\r\n",
"#Start time\t2013-08-16T14:31:22\tRun time\t0:11:44:47\r\n",
"#status\tsuccess\r\n",
"\r\n",
"Query\tType\tTitle\tcoordinates\tcomplete size\tmapped size\tsource domain\r\n",
"Q#2 - >CGI_10000456\tspecific\tactive site residues\tK116,K123\t2\t2\t238385\r\n",
"Q#2 - >CGI_10000456\tspecific\tMoaE homodimer interface\tS17,T21,G26,I28,S29,I30,F31,V32,I34,R36,A90,P99,R101,E109,D113\t15\t15\t238385\r\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head /Volumes/web/cnidarian/TJGR_CCD_dom_con_S_define.txt"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"#Batch CD-search tool\tNIH/NLM/NCBI\r\n",
"#cdsid\tQM3-qcdsearch-1DCFE6033927026F-5D508E148AAAD632\r\n",
"#datatype\thits Concise data(Superfamily only)\r\n",
"#status\t0\r\n",
"#Start time\t2013-08-16T14:31:22\tRun time\t0:11:44:47\r\n",
"#status\tsuccess\r\n",
"\r\n",
"Query\tHit type\tPSSM-ID\tFrom\tTo\tE-Value\tBitscore\tAccession\tShort name\tIncomplete\tSuperfamily\tDefinition\r\n",
"Q#2 - >CGI_10000456\tsuperfamily\t241841\t11\t135\t8.33438e-46\t147.279\tcl00399\tMoaE superfamily\t - \t - \tMoaE family. Members of this family are involved in biosynthesis of the molybdenum cofactor (Moco), an essential cofactor for a diverse group of redox enzymes. Moco biosynthesis is an evolutionarily conserved pathway present in eubacteria, archaea and eukaryotes. Moco contains a tricyclic pyranopterin, termed molybdopterin (MPT), which carries the cis-dithiolene group responsible for molybdenum ligation. This dithiolene group is generated by MPT synthase in the second major step in Moco biosynthesis. MPT synthase is a heterotetramer consisting of two large (MoaE) and two small (MoaD) subunits.\r\n",
"Q#4 - >CGI_10000774\tsuperfamily\t220249\t54\t121\t1.85274e-18\t74.564\tcl09695\tH_lectin superfamily\t - \t - \tH-type lectin domain; The H-type lectin domain is a unit of six beta chains, combined into a homo-hexamer. It is involved in self/non-self recognition of cells, through binding with carbohydrates. It is sometimes found in association with the F5_F8_type_C domain pfam00754.\r\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head /Volumes/web/cnidarian/TJGR_CCD_dom_con.txt"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"#Batch CD-search tool\tNIH/NLM/NCBI\r\n",
"#cdsid\tQM3-qcdsearch-1DCFE6033927026F-517F715027090CD6\r\n",
"#datatype\thits Concise data\r\n",
"#status\t0\r\n",
"#Start time\t2013-08-16T14:31:22\tRun time\t0:11:44:47\r\n",
"#status\tsuccess\r\n",
"\r\n",
"Query\tHit type\tPSSM-ID\tFrom\tTo\tE-Value\tBitscore\tAccession\tShort name\tIncomplete\tSuperfamily\r\n",
"Q#2 - >CGI_10000456\tspecific\t238385\t11\t135\t8.33438e-46\t147.279\tcd00756\tMoaE\t - \tcl00399\r\n",
"Q#2 - >CGI_10000456\tsuperfamily\t241841\t11\t135\t8.33438e-46\t147.279\tcl00399\tMoaE superfamily\t - \t - \r\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"File with definition uploaded to SQLShare \n",
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Query to find \"CpG\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"SELECT * \n",
" FROM [sr320@washington.edu].[TJGR_CCD_domain_concise_Superfamily_def]\n",
" where \n",
" Definition like '%CpG%'\n",
"``` "
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!wc /Volumes/web/cnidarian/TJGR_CpG_domain.csv"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 18 1699 13383 /Volumes/web/cnidarian/TJGR_CpG_domain.csv\r\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}