{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Wed Oct 23 11:08:28 PDT 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.798\n", "BogoMIPS: 5851.95\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 13G 37G 602M 19G 55G\n", "Swap: 4.7G 1.0G 3.6G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer 
Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set variables\n", "`%env` variables are good for passing to bash cells" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20191022_pgen_Pgen_v074.a4_BLAST-GO-mapping\n", "env: rsync_gannet=gannet:/volume2/web/Atumefaciens/20190928_Pgenerosa_v074.a4_gensas_annotation/Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab\n", "env: blast_tab=Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab\n", "env: sp_id_list=20191022_Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional_sp_id.list\n", "env: sp_id_go_mapping_tab=20191022_SP-ID-GO-mapping.tab\n", "env: blast_go_join=20191022_Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional_sp_id-join-go_id.tab\n" ] } ], "source": [ "# Working directory: defined once as a Python variable, then exported under\n", "# the same name so the bash cells can read it as ${wd}.\n", "wd = \"/home/sam/analyses/20191022_pgen_Pgen_v074.a4_BLAST-GO-mapping\"\n", "%env wd=$wd\n", "\n", "# Remote source file and local input/output file names for the bash cells\n", "%env rsync_gannet=gannet:/volume2/web/Atumefaciens/20190928_Pgenerosa_v074.a4_gensas_annotation/Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab\n", "%env blast_tab=Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab\n", "%env sp_id_list=20191022_Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional_sp_id.list\n", "%env sp_id_go_mapping_tab=20191022_SP-ID-GO-mapping.tab\n", "%env blast_go_join=20191022_Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional_sp_id-join-go_id.tab\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### If directory doesn't exist, make it." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "# --parents makes this a no-op when the directory already exists\n", "mkdir --parents \"${wd}\"" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20191022_pgen_Pgen_v074.a4_BLAST-GO-mapping\n" ] } ], "source": [ "cd {wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download file\n", "\n", "#### Use `wget https://gannet.fish.washington.edu/Atumefaciens/20190928_Pgenerosa_v074.a4_gensas_annotation/Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab` if you don't have `rsync` capabilities." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab\n", "\n", "sent 30 bytes received 1,533,721 bytes 1,022,500.67 bytes/sec\n", "total size is 1,533,389 speedup is 1.00\n", "--------------------------------------\n", "total 1.5M\n", "-rw-rw-rw- 1 sam users 1.5M Oct 3 11:18 Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab\n" ] } ], "source": [ "%%bash\n", "# Copy the BLAST annotation table from the remote host into the current directory\n", "rsync --archive --verbose \"${rsync_gannet}\" .\n", "\n", "# Separator, then list what is now in the current directory\n", "printf '%s\\n' \"--------------------------------------\"\n", "ls -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check out the file.\n", "#### Confirm removal of first 13 lines removes header\n", "#### Count number of annotated mRNAs in file" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#\n", "# Output is generated by GenSAS 7.x-5.0\n", "#\n", "#name : mRNA\n", "#start : Start of alignment in subject\n", "#end : End of alignment in subject\n", "#m_start : Start of alignment in query\n", "#m_end : End of alignment in query\n", "#al : Alignment length\n", "#score : 
Row score of the match\n", "#evalue : E value of the match\n", "#identity : Percentage of identical matches\n", "mame\tstart\tend\tscore\tAccession\tMatch ID\tm_start\tm_end\tE-value\tidentity\tal\n", "21910-PGEN_.00g000010.m01\t121\t229\t165\tQ86IC9\tsp|Q86IC9|CAMT1_DICDI\t11\t122\t8.93e-14\t35.652\t115\n", "21910-PGEN_.00g000020.m01\t147\t467\t968\tP04177\tsp|P04177|TY3H_RAT\t20\t339\t3.47e-127\t55.140\t321\n", "21910-PGEN_.00g000050.m01\t566\t722\t182\tQ8L840\tsp|Q8L840|RQL4A_ARATH\t2\t167\t2.67e-14\t35.119\t168\n", "21910-PGEN_.00g000080.m01\t268\t322\t152\tA1E2V0\tsp|A1E2V0|BIRC3_CANLF\t163\t220\t3.91e-10\t53.448\t58\n", "21910-PGEN_.00g000090.m01\t199\t327\t161\tP34456\tsp|P34456|YMD2_CAEEL\t7\t134\t7.52e-12\t26.357\t129\n", "21910-PGEN_.00g000210.m01\t18\t200\t263\tO00463\tsp|O00463|TRAF5_HUMAN\t5\t191\t2.24e-25\t34.921\t189\n", "21910-PGEN_.00g000230.m01\t48\t155\t287\tQ00945\tsp|Q00945|CONO_LYMST\t31\t134\t1.59e-32\t50.000\t108\n", "21910-PGEN_.00g000240.m01\t4\t605\t1091\tQ5SWK7\tsp|Q5SWK7|RN145_MOUSE\t13\t601\t2.65e-139\t39.607\t611\n", "21910-PGEN_.00g000280.m01\t4\t153\t210\tQ8ZXT3\tsp|Q8ZXT3|Y1111_PYRAE\t853\t1012\t1.10e-17\t38.750\t160\n", "21910-PGEN_.00g000300.m01\t159\t347\t480\tQ5REG4\tsp|Q5REG4|DTX3_PONAB\t1135\t1320\t1.20e-51\t50.794\t189\n", "21910-PGEN_.00g000300.m02\t159\t347\t480\tQ5REG4\tsp|Q5REG4|DTX3_PONAB\t1138\t1323\t1.18e-51\t50.794\t189\n", "21910-PGEN_.00g000380.m01\t381\t508\t205\tQ8QG60\tsp|Q8QG60|CRY2_CHICK\t2\t145\t4.92e-18\t36.111\t144\n", "\n", "----------------------------\n", "21910-PGEN_.00g000010.m01\t121\t229\t165\tQ86IC9\tsp|Q86IC9|CAMT1_DICDI\t11\t122\t8.93e-14\t35.652\t115\n", "21910-PGEN_.00g000020.m01\t147\t467\t968\tP04177\tsp|P04177|TY3H_RAT\t20\t339\t3.47e-127\t55.140\t321\n", "21910-PGEN_.00g000050.m01\t566\t722\t182\tQ8L840\tsp|Q8L840|RQL4A_ARATH\t2\t167\t2.67e-14\t35.119\t168\n", "21910-PGEN_.00g000080.m01\t268\t322\t152\tA1E2V0\tsp|A1E2V0|BIRC3_CANLF\t163\t220\t3.91e-10\t53.448\t58\n", 
"21910-PGEN_.00g000090.m01\t199\t327\t161\tP34456\tsp|P34456|YMD2_CAEEL\t7\t134\t7.52e-12\t26.357\t129\n", "21910-PGEN_.00g000210.m01\t18\t200\t263\tO00463\tsp|O00463|TRAF5_HUMAN\t5\t191\t2.24e-25\t34.921\t189\n", "21910-PGEN_.00g000230.m01\t48\t155\t287\tQ00945\tsp|Q00945|CONO_LYMST\t31\t134\t1.59e-32\t50.000\t108\n", "21910-PGEN_.00g000240.m01\t4\t605\t1091\tQ5SWK7\tsp|Q5SWK7|RN145_MOUSE\t13\t601\t2.65e-139\t39.607\t611\n", "21910-PGEN_.00g000280.m01\t4\t153\t210\tQ8ZXT3\tsp|Q8ZXT3|Y1111_PYRAE\t853\t1012\t1.10e-17\t38.750\t160\n", "21910-PGEN_.00g000300.m01\t159\t347\t480\tQ5REG4\tsp|Q5REG4|DTX3_PONAB\t1135\t1320\t1.20e-51\t50.794\t189\n", "\n", "\n", "----------------------------\n", "16548\n" ] } ], "source": [ "%%bash\n", "head -n 25 \"${blast_tab}\"\n", "echo \"\"\n", "echo \"----------------------------\"\n", "tail -n +14 \"${blast_tab}\" | head\n", "echo \"\"\n", "echo \"\"\n", "echo \"----------------------------\"\n", "tail -n +14 \"${blast_tab}\" | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create list of SwissProt IDs to submit to UniProt mapping website:\n", "https://www.uniprot.org/uploadlists\n", "\n", "#### Check number of unique entries\n", "\n", "The UniProt mapping website only generates matches for each unique ID." 
] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "21910-PGEN_.00g000010.m01\t121\t229\t165\tQ86IC9\tsp|Q86IC9|CAMT1_DICDI\t11\t122\t8.93e-14\t35.652\t115\n", "21910-PGEN_.00g000020.m01\t147\t467\t968\tP04177\tsp|P04177|TY3H_RAT\t20\t339\t3.47e-127\t55.140\t321\n", "21910-PGEN_.00g000050.m01\t566\t722\t182\tQ8L840\tsp|Q8L840|RQL4A_ARATH\t2\t167\t2.67e-14\t35.119\t168\n", "21910-PGEN_.00g000080.m01\t268\t322\t152\tA1E2V0\tsp|A1E2V0|BIRC3_CANLF\t163\t220\t3.91e-10\t53.448\t58\n", "21910-PGEN_.00g000090.m01\t199\t327\t161\tP34456\tsp|P34456|YMD2_CAEEL\t7\t134\t7.52e-12\t26.357\t129\n", "21910-PGEN_.00g000210.m01\t18\t200\t263\tO00463\tsp|O00463|TRAF5_HUMAN\t5\t191\t2.24e-25\t34.921\t189\n", "21910-PGEN_.00g000230.m01\t48\t155\t287\tQ00945\tsp|Q00945|CONO_LYMST\t31\t134\t1.59e-32\t50.000\t108\n", "21910-PGEN_.00g000240.m01\t4\t605\t1091\tQ5SWK7\tsp|Q5SWK7|RN145_MOUSE\t13\t601\t2.65e-139\t39.607\t611\n", "21910-PGEN_.00g000280.m01\t4\t153\t210\tQ8ZXT3\tsp|Q8ZXT3|Y1111_PYRAE\t853\t1012\t1.10e-17\t38.750\t160\n", "21910-PGEN_.00g000300.m01\t159\t347\t480\tQ5REG4\tsp|Q5REG4|DTX3_PONAB\t1135\t1320\t1.20e-51\t50.794\t189\n", "\n", "16548\n" ] } ], "source": [ "%%bash\n", "# Skip the 13 header lines and keep only column 5 (the \"Accession\" column)\n", "# of each data row; cut splits on tabs by default.\n", "tail -n +14 \"${blast_tab}\" | cut -f 5 > \"${sp_id_list}\"\n", "head \"${sp_id_list}\"\n", "echo \"\"\n", "# Number of unique accessions in the list\n", "sort -u \"${sp_id_list}\" | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check out UniProt GO ID mapping file\n", "\n", "Line count below should match the `uniq` line count from the cell above" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Entry\tyourlist:M20191022216DA2B77BFBD2E6699CA9B6D1C41EB24D59497\tGene ontology IDs\n", "Q86IC9\tQ86IC9\tGO:0042409; GO:0046872\n", "P04177\tP04177\tGO:0001666; GO:0001963; GO:0001975; GO:0004497; GO:0004511; GO:0005634; GO:0005737; GO:0005739; GO:0005790; GO:0005829; 
GO:0006585; GO:0006631; GO:0006665; GO:0007507; GO:0007601; GO:0007605; GO:0007612; GO:0007613; GO:0007617; GO:0007626; GO:0008016; GO:0008021; GO:0008198; GO:0008199; GO:0009414; GO:0009416; GO:0009635; GO:0009651; GO:0009898; GO:0010038; GO:0010043; GO:0010259; GO:0014070; GO:0014823; GO:0015842; GO:0016137; GO:0016597; GO:0017085; GO:0018963; GO:0019825; GO:0019899; GO:0019904; GO:0021987; GO:0030424; GO:0030425; GO:0030659; GO:0031667; GO:0032355; GO:0032496; GO:0033076; GO:0033162; GO:0034617; GO:0035094; GO:0035176; GO:0035240; GO:0035690; GO:0035900; GO:0035902; GO:0042136; GO:0042214; GO:0042418; GO:0042421; GO:0042423; GO:0042462; GO:0042493; GO:0042745; GO:0042755; GO:0043005; GO:0043025; GO:0043195; GO:0043204; GO:0043434; GO:0045471; GO:0045472; GO:0046684; GO:0048545; GO:0048596; GO:0050890; GO:0051289; GO:0051412; GO:0051602; GO:0052314; GO:0070848; GO:0071287; GO:0071312; GO:0071316; GO:0071333; GO:0071363\n", "Q8L840\tQ8L840\tGO:0000724; GO:0003676; GO:0005524; GO:0005634; GO:0005694; GO:0005737; GO:0006268; GO:0006281; GO:0006310; GO:0006974; GO:0009378; GO:0009506; GO:0032508; GO:0043138; GO:0046872; GO:0051276; GO:0070417; GO:0071215\n", "A1E2V0\tA1E2V0\tGO:0005634; GO:0005737; GO:0031398; GO:0043027; GO:0043154; GO:0046872; GO:0060546; GO:0061630; GO:1990001\n", "P34456\tP34456\tGO:0004748; GO:0005829; GO:0005971; GO:0009263\n", "O00463\tO00463\tGO:0005164; GO:0005813; GO:0005829; GO:0006915; GO:0007165; GO:0008270; GO:0009898; GO:0031625; GO:0031996; GO:0032991; GO:0035631; GO:0042802; GO:0042981; GO:0043123; GO:0051091; GO:0051092\n", "Q00945\tQ00945\tGO:0005185; GO:0005576\n", "Q5SWK7\tQ5SWK7\tGO:0005783; GO:0005789; GO:0008270; GO:0012505; GO:0016021; GO:0061630\n", "Q8ZXT3\tQ8ZXT3\t\n", "\n", "10157\n" ] } ], "source": [ "%%bash\n", "head \"${sp_id_go_mapping_tab}\"\n", "echo \"\"\n", "tail -n +2 \"${sp_id_go_mapping_tab}\" | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Join the two files, after sorting each" ] }, 
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0A044RE18 21910-PGEN_.00g298100.m01 195 626 542 sp|A0A044RE18|BLI_ONCVO 99 542 7.89e-57 32.972 461 A0A044RE18 GO:0004252; GO:0005576; GO:0005634; GO:0007635; GO:0031638; GO:0040002; GO:0045887; GO:0046872; GO:0090472; GO:1902075\n", "A0A044RE18 21910-PGEN_.00g298510.m01 195 626 539 sp|A0A044RE18|BLI_ONCVO 135 578 2.66e-56 32.829 463 A0A044RE18 GO:0004252; GO:0005576; GO:0005634; GO:0007635; GO:0031638; GO:0040002; GO:0045887; GO:0046872; GO:0090472; GO:1902075\n", "A0A0A7DNP6 21910-PGEN_.00g047160.m01 5 94 284 sp|A0A0A7DNP6|GRHLP_RUDPH 9 96 1.66e-33 56.667 90 A0A0A7DNP6 GO:0005576; GO:0007218\n", "A0A0B5A7M7 21910-PGEN_.00g059020.m01 26 149 135 sp|A0A0B5A7M7|INS1_CONIM 59 178 5.90e-09 29.927 137 A0A0B5A7M7 GO:0005179; GO:0005576; GO:0006006; GO:0090729\n", "A0A0B5A8P8 21910-PGEN_.00g272560.m01 33 140 164 sp|A0A0B5A8P8|INS2_CONIM 27 140 7.44e-14 31.707 123 A0A0B5A8P8 GO:0005179; GO:0005576; GO:0006006; GO:0090729\n", "A0A0E0RTV6 21910-PGEN_.00g274950.m01 119 292 258 sp|A0A0E0RTV6|ZEB1_GIBZE 126 311 1.75e-22 38.298 188 A0A0E0RTV6 GO:0016491; GO:0071949\n", "A0A0F7YYX3 21910-PGEN_.00g031120.m01 12 193 197 sp|A0A0F7YYX3|CPROH_CONVC 28 213 1.19e-17 31.720 186 A0A0F7YYX3 GO:0005576\n", "A0A0F7YZI5 21910-PGEN_.00g015590.m01 22 135 327 sp|A0A0F7YZI5|CTHB5_CONVC 56 168 2.42e-38 54.783 115 A0A0F7YZI5 GO:0005179; GO:0005576; GO:0090729\n", "A0A0F7Z3J2 21910-PGEN_.00g015600.m01 45 143 368 sp|A0A0F7Z3J2|CTHA2_CONVC 72 170 2.24e-44 68.687 99 A0A0F7Z3J2 GO:0005179; GO:0005576; GO:0090729\n", "A0A0G2K344 21910-PGEN_.00g320520.m01 5 1063 3021 sp|A0A0G2K344|PK3CA_RAT 3 1049 0.0 54.229 1064 A0A0G2K344 GO:0001525; GO:0004674; GO:0005524; GO:0005737; GO:0005829; GO:0005886; GO:0005942; GO:0005943; GO:0006909; GO:0010629; GO:0014065; GO:0014870; GO:0016020; GO:0016303; GO:0035005; GO:0036092; GO:0043201; GO:0046854; GO:0046934; GO:0048015; GO:0048661; 
GO:0052812; GO:0071548; GO:1903544\n", "\n", "---------------------------------------------\n", "\n", "16548 20191022_Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional_sp_id-join-go_id.tab\n" ] } ], "source": [ "%%bash\n", "# Join the BLAST table (key: column 5) to the UniProt mapping table\n", "# (key: its column 2, moved to column 1 by awk). Each input is sorted on its key.\n", "# Tab is the field separator for sort, awk and join alike, so the GO ID list\n", "# (which contains spaces) stays in one column of the tab-delimited output.\n", "join -t $'\\t' -1 5 -2 1 \\\n", "<(tail -n +14 \"${blast_tab}\" | sort -t $'\\t' -k5,5) \\\n", "<(awk -F\"\\t\" -v OFS=\"\\t\" 'FNR>1 {print $2, $1, $3}' \"${sp_id_go_mapping_tab}\" | sort -t $'\\t' -k1,1) \\\n", "> \"${blast_go_join}\"\n", "\n", "# Check out new file\n", "head \"${blast_go_join}\"\n", "echo \"\"\n", "echo \"---------------------------------------------\"\n", "echo \"\"\n", "# Line count of new file (should match $sp_id_list line count)\n", "wc -l \"${blast_go_join}\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }