{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Notebook to document creation of new Uniprot GO:GOslim file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Files\n", "\n", "##### All files were downloaded 20190423\n", "\n", "GOslim Generic File: \n", "\n", "- http://current.geneontology.org/ontology/subsets/goslim_generic.obo\n", "\n", "UniProt GeneOntology Annotation File:\n", "\n", "- http://current.geneontology.org/annotations/goa_uniprot_all.gaf.gz\n", "\n", " - File format description: http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 99G\n", "-rw-rw-r-- 1 sam sam 1.3K Apr 24 12:29 20190424_swoose_uniprot_go_goslim.ipynb\n", "-rw-rw-r-- 1 sam sam 99G Apr 23 11:19 goa_uniprot_all.gaf\n", "-rw-rw-r-- 1 sam sam 129K Apr 18 06:33 goslim_generic.obo\n" ] } ], "source": [ "%%bash\n", "ls -lh" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "!gaf-version: 2.1\n", "!\n", "!This file contains all GO annotations and gene product information for proteins in the UniProt KnowledgeBase (UniProtKB),\n", "!ComplexPortal protein complexes, and RNAcentral identifiers.\n", "!\n", "!Generated: 2019-04-08 11:50 \n", "!GO-version: http://purl.obolibrary.org/obo/go/releases/2019-03-29/extensions/go-plus.owl\n", "!\n", "UniProtKB\tA0A000\tmoeA5\t\tGO:0003824\tGO_REF:0000002\tIEA\tInterPro:IPR015421|InterPro:IPR015422\tF\tMoeA5\tmoeA5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A000\tmoeA5\t\tGO:0003870\tGO_REF:0000002\tIEA\tInterPro:IPR010961\tF\tMoeA5\tmoeA5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A000\tmoeA5\t\tGO:0009058\tGO_REF:0000002\tIEA\tInterPro:IPR004839\tP\tMoeA5\tmoeA5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", 
"UniProtKB\tA0A000\tmoeA5\t\tGO:0030170\tGO_REF:0000002\tIEA\tInterPro:IPR004839|InterPro:IPR010961\tF\tMoeA5\tmoeA5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A000\tmoeA5\t\tGO:0033014\tGO_REF:0000002\tIEA\tInterPro:IPR010961\tP\tMoeA5\tmoeA5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0000166\tGO_REF:0000038\tIEA\tUniProtKB-KW:KW-0547\tF\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tUniProt\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0000166\tGO_REF:0000104\tIEA\tUniRule:UR000400038\tF\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tUniProt\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0005524\tGO_REF:0000002\tIEA\tInterPro:IPR003439|InterPro:IPR011527|InterPro:IPR017871|InterPro:IPR036640\tF\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0005524\tGO_REF:0000038\tIEA\tUniProtKB-KW:KW-0067\tF\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tUniProt\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0005524\tGO_REF:0000104\tIEA\tUniRule:UR000400038\tF\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tUniProt\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0016020\tGO_REF:0000038\tIEA\tUniProtKB-KW:KW-0472\tC\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tUniProt\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0016021\tGO_REF:0000002\tIEA\tInterPro:IPR011527|InterPro:IPR036640\tC\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0016021\tGO_REF:0000038\tIEA\tUniProtKB-KW:KW-0812\tC\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tUniProt\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0016887\tGO_REF:0000002\tIEA\tInterPro:IPR003439|InterPro:IPR017871\tF\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0042626\tGO_REF:0000002\tIEA\tInterPro:IPR011527\tF\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", 
"UniProtKB\tA0A001\tmoeD5\t\tGO:0055085\tGO_REF:0000002\tIEA\tInterPro:IPR011527\tP\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A002\tA0A002\t\tGO:0000166\tGO_REF:0000038\tIEA\tUniProtKB-KW:KW-0547\tF\tMoeJ5\t\tprotein\ttaxon:67581\t20190406\tUniProt\t\t\n" ] } ], "source": [ "%%bash\n", "head -n 25 goa_uniprot_all.gaf" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "format-version: 1.2\n", "subsetdef: gocheck_do_not_annotate \"Term not to be used for direct annotation\"\n", "subsetdef: gocheck_do_not_manually_annotate \"Term not to be used for direct manual annotation\"\n", "subsetdef: goslim_agr \"AGR slim\"\n", "subsetdef: goslim_aspergillus \"Aspergillus GO slim\"\n", "subsetdef: goslim_candida \"Candida GO slim\"\n", "subsetdef: goslim_chembl \"ChEMBL protein targets summary\"\n", "subsetdef: goslim_flybase_ribbon \"FlyBase Drosophila GO ribbon slim\"\n", "subsetdef: goslim_generic \"Generic GO slim\"\n", "subsetdef: goslim_metagenomics \"Metagenomics GO slim\"\n", "subsetdef: goslim_mouse \"Mouse GO slim\"\n", "subsetdef: goslim_pir \"PIR GO slim\"\n", "subsetdef: goslim_plant \"Plant GO slim\"\n", "subsetdef: goslim_pombe \"Fission yeast GO slim\"\n", "subsetdef: goslim_synapse \"synapse GO slim\"\n", "subsetdef: goslim_yeast \"Yeast GO slim\"\n", "synonymtypedef: syngo_official_label \"label approved by the SynGO project\"\n", "synonymtypedef: systematic_synonym \"Systematic synonym\" EXACT\n", "ontology: go/subsets/goslim_generic\n", "\n", "[Term]\n", "id: GO:0000003\n", "name: reproduction\n", "namespace: biological_process\n", "alt_id: GO:0019952\n" ] } ], "source": [ "%%bash\n", "head -n 25 goslim_generic.obo" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Extract just biological processes from the GOA file\n", "\n", "Represented by `P` in the \"Aspect\" column (Column #8)" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "Line count:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "612663769 goa_uniprot_all.gaf\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "real\t15m51.024s\n", "user\t0m27.220s\n", "sys\t0m51.784s\n" ] } ], "source": [ "%%bash\n", "time wc -l goa_uniprot_all.gaf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use awk to pull out desired records.\n", "\n", "But first, verify awk command does what we want..." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "UniProtKB\tA0A000\tmoeA5\t\tGO:0009058\tGO_REF:0000002\tIEA\tInterPro:IPR004839\tP\tMoeA5\tmoeA5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A000\tmoeA5\t\tGO:0033014\tGO_REF:0000002\tIEA\tInterPro:IPR010961\tP\tMoeA5\tmoeA5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0055085\tGO_REF:0000002\tIEA\tInterPro:IPR011527\tP\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A002\tA0A002\t\tGO:0055085\tGO_REF:0000002\tIEA\tInterPro:IPR011527\tP\tMoeJ5\t\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A004\tmoeF5\t\tGO:0006529\tGO_REF:0000002\tIEA\tInterPro:IPR001962|InterPro:IPR006426\tP\tMoeF5\tmoeF5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A004\tmoeF5\t\tGO:0006529\tGO_REF:0000038\tIEA\tUniProtKB-KW:KW-0061\tP\tMoeF5\tmoeF5\tprotein\ttaxon:67581\t20190406\tUniProt\t\t\n", "UniProtKB\tA0A004\tmoeF5\t\tGO:0006541\tGO_REF:0000038\tIEA\tUniProtKB-KW:KW-0315\tP\tMoeF5\tmoeF5\tprotein\ttaxon:67581\t20190406\tUniProt\t\t\n", "UniProtKB\tA0A004\tmoeF5\t\tGO:0008652\tGO_REF:0000038\tIEA\tUniProtKB-KW:KW-0028\tP\tMoeF5\tmoeF5\tprotein\ttaxon:67581\t20190406\tUniProt\t\t\n", 
"UniProtKB\tA0A009\tmoeM5\t\tGO:0009058\tGO_REF:0000002\tIEA\tInterPro:IPR003696\tP\tMoeM5\tmoeM5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n" ] } ], "source": [ "%%bash\n", "head -n 50 goa_uniprot_all.gaf | awk '{if ($8==\"P\") print}'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, let's put all of them in a new file" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "real\t22m15.433s\n", "user\t8m9.976s\n", "sys\t1m37.932s\n" ] } ], "source": [ "%%bash\n", "time \\\n", "awk '{if ($8==\"P\") print}' goa_uniprot_all.gaf \\\n", "> goa_uniprot_P.gaf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Count biological process entries in new file" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Line count:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "181626721 goa_uniprot_P.gaf\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "real\t4m17.147s\n", "user\t0m7.680s\n", "sys\t0m13.904s\n" ] } ], "source": [ "%%bash\n", "time \\\n", "wc -l goa_uniprot_P.gaf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Pull GOslim terms to use as list for pulling from goa_uniprot_P.gaf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Counts to confirm that parsing strategy will work.\n", "\n", "If it works correctly, both `grep` commands should yield same number." 
] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GOslim Term counts\n", "147\n", "----------------------------------\n", "----------------------------------\n", "GOslim ID counts\n", "147\n" ] } ], "source": [ "%%bash\n", "echo \"GOslim Term counts\"\n", "# Search for lines beginning (^) with [Term]\n", "# Backslashes needed to escape brackets to prevent interpretation as regex\n", "grep \"^\\[Term\\]\" goslim_generic.obo | wc -l\n", "echo \"----------------------------------\"\n", "echo \"----------------------------------\"\n", "# Search for lines beginning (^) with id: GO:\n", "echo \"GOslim ID counts\"\n", "grep \"^id: GO:\" goslim_generic.obo | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It worked, so let's create a list of GOslim IDs" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "147 goslim_ID_list.txt\n", "GO:0000003\n", "GO:0000228\n", "GO:0000229\n", "GO:0000278\n", "GO:0000902\n", "GO:0002376\n", "GO:0003013\n", "GO:0003674\n", "GO:0003677\n", "GO:0003700\n" ] } ], "source": [ "%%bash\n", "grep \"^id: GO:\" goslim_generic.obo \\\n", "| awk '{print $2}' \\\n", "> goslim_ID_list.txt\n", "\n", "wc -l goslim_ID_list.txt\n", "\n", "head goslim_ID_list.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create file of GO biological processes (P) UniProt terms that match GOslims" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "28391971 uniprot_goslim_P.txt\n", "UniProtKB\tA0A000\tmoeA5\t\tGO:0009058\tGO_REF:0000002\tIEA\tInterPro:IPR004839\tP\tMoeA5\tmoeA5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A001\tmoeD5\t\tGO:0055085\tGO_REF:0000002\tIEA\tInterPro:IPR011527\tP\tMoeD5\tmoeD5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", 
"UniProtKB\tA0A002\tA0A002\t\tGO:0055085\tGO_REF:0000002\tIEA\tInterPro:IPR011527\tP\tMoeJ5\t\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A009\tmoeM5\t\tGO:0009058\tGO_REF:0000002\tIEA\tInterPro:IPR003696\tP\tMoeM5\tmoeM5\tprotein\ttaxon:67581\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A009DWE1\tJ504_3685\t\tGO:0055085\tGO_REF:0000002\tIEA\tInterPro:IPR001036\tP\tAcrB/AcrD/AcrF family protein\tJ504_3685\tprotein\ttaxon:1310605\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A009DWJ5\tJ504_3662\t\tGO:0032196\tGO_REF:0000002\tIEA\tInterPro:IPR038965\tP\tPutative transposase\tJ504_3662\tprotein\ttaxon:1310605\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A009DWL0\tJ504_3657\t\tGO:0032196\tGO_REF:0000002\tIEA\tInterPro:IPR038965\tP\tPutative iSRSO8-transposase orfB protein\tJ504_3657\tprotein\ttaxon:1310605\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A009E3I5\tJ504_3523\t\tGO:0032196\tGO_REF:0000002\tIEA\tInterPro:IPR038965\tP\tIntegrase core domain protein\tJ504_3523\tprotein\ttaxon:1310605\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A009E5V4\tJ504_3420\t\tGO:0032196\tGO_REF:0000002\tIEA\tInterPro:IPR038965\tP\tIntegrase core domain protein\tJ504_3420|J504_3472\tprotein\ttaxon:1310605\t20190406\tInterPro\t\t\n", "UniProtKB\tA0A009E6I2\tJ504_3410\t\tGO:0032196\tGO_REF:0000002\tIEA\tInterPro:IPR038965\tP\tIntegrase core domain protein\tJ504_3410\tprotein\ttaxon:1310605\t20190406\tInterPro\t\t\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "real\t1m41.021s\n", "user\t1m27.764s\n", "sys\t0m10.892s\n" ] } ], "source": [ "%%bash\n", "time \\\n", "grep --file=goslim_ID_list.txt goa_uniprot_P.gaf \\\n", "> uniprot_goslim_P.txt\n", "\n", "wc -l uniprot_goslim_P.txt\n", "\n", "head uniprot_goslim_P.txt" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-rw-r-- 1 sam sam 4.7G Apr 24 14:04 uniprot_goslim_P.txt\n" ] } ], "source": [ "ls -lh uniprot_goslim_P.txt" ] }, 
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Alrighty, I _think_ this can be used as an updated replacement for our \"undocumented\" and outdated GOslim file:\n", "\n", "- http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sending incremental file list\n", "uniprot_goslim_P.txt\n", "\n", "sent 4,944,356,869 bytes received 34 bytes 106,330,255.98 bytes/sec\n", "total size is 4,943,149,944 speedup is 1.00\n" ] } ], "source": [ "%%bash\n", "rsync -av uniprot_goslim_P.txt \\\n", "gannet:/volume2/web/Atumefaciens/20190424_uniprot_go_goslim_P" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }