{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [
   "### This notebook generates separate GFF files from the Pgenerosa_v070 annotated genome.\n",
   "\n",
   "##### Genome annotation details can be found here: https://robertslab.github.io/sams-notebook/2019/02/28/Genome-Annotation-Pgenerosa_v070-MAKER-on-Mox.html" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
   "#### Set variables\n",
   "##### Variables set with the `%env` magic are exported to the environment, so they are usable by `bash` cells" ] },
 { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   "env: work_dir=/home/sam/Downloads/20190523_pgen_Pgenerosa_v070_gffs\n",
   "env: pgen70_fasta_url=http://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa\n",
   "env: pgen70_gff_url=http://gannet.fish.washington.edu/Atumefaciens/20190228_pgen_maker_v070_annotation/Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff\n" ] } ], "source": [
   "import os\n",
   "\n",
   "%env work_dir = /home/sam/Downloads/20190523_pgen_Pgenerosa_v070_gffs\n",
   "%env pgen70_fasta_url = http://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa\n",
   "%env pgen70_gff_url = http://gannet.fish.washington.edu/Atumefaciens/20190228_pgen_maker_v070_annotation/Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff\n",
   "\n",
   "# Read the path back from the environment so it is defined in exactly one place;\n",
   "# the Python copy is what `cd $work_dir` expands in a later cell.\n",
   "work_dir = os.environ[\"work_dir\"]" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
   "%%bash\n",
   "# -p: succeed when the directory already exists, so the notebook can be re-run.\n",
   "mkdir -p \"${work_dir}\"" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
   "#### Using the Python variable allows a persistent (i.e. across cells) directory change.\n",
   "#### Using the `bash` variable only changes directories for the duration of that cell."
] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   "/home/sam/Downloads/20190523_pgen_Pgenerosa_v070_gffs\n" ] } ], "source": [
   "%cd $work_dir" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
   "#### Download files via `rsync`.\n",
   "\n",
   "An option is available to use `wget`." ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   "receiving incremental file list\n",
   "Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff\n",
   "\n",
   "sent 30 bytes received 7,597,953,355 bytes 98,038,108.19 bytes/sec\n",
   "total size is 7,597,025,836 speedup is 1.00\n",
   "receiving incremental file list\n",
   "Pgenerosa_v070.fa\n",
   "\n",
   "sent 30 bytes received 2,247,392,295 bytes 25,394,263.56 bytes/sec\n",
   "total size is 2,247,117,885 speedup is 1.00\n",
   "total 9.2G\n",
   "-rw-r--r-- 1 sam users 2.1G Feb 11 12:13 Pgenerosa_v070.fa\n",
   "-rw-r--r-- 1 sam users 7.1G May 19 02:16 Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff\n" ] } ], "source": [
   "%%bash\n",
   "# Remote sources, fetched in order: the annotation GFF, then the genome FASTA.\n",
   "remote_files=(\n",
   "  gannet:/volume2/web/Atumefaciens/20190228_pgen_maker_v070_annotation/Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff\n",
   "  owl:/volume1/web/halfshell/genomic-databank/Pgenerosa_v070.fa\n",
   ")\n",
   "\n",
   "for remote_file in \"${remote_files[@]}\"\n",
   "do\n",
   "  rsync -av \"${remote_file}\" .\n",
   "done\n",
   "\n",
   "# Uncomment below to download if you don't have rsync access\n",
   "# wget \"${pgen70_fasta_url}\"\n",
   "# wget \"${pgen70_gff_url}\"\n",
   "\n",
   "ls -lh" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
   "#### Create separate GFF files for specific features and generate MD5 checksums for each new GFF."
] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   "1c4e0bd30579798948c8e1e32c7052e3 Pgenerosa_v070.CDS.gff\n",
   "6ea9360ea24ddfc4ed07486547c3319e Pgenerosa_v070.exon.gff\n",
   "cdb2fff32712dd4d9223dd8674b03a92 Pgenerosa_v070.gene.gff\n",
   "1a6e0f4d9e515fa2b7d84c07f66cd8da Pgenerosa_v070.mRNA.gff\n" ] } ], "source": [
   "%%bash\n",
   "gff=\"Pgenerosa_v070_genome_snap02.all.renamed.putative_function.gff\"\n",
   "\n",
   "# Single pass over the 7.1G annotation instead of one full scan per feature type.\n",
   "# BEGIN writes the GFF3 version pragma to each per-feature file; afterwards every\n",
   "# record whose feature type (column 3) is a target is written to the matching file,\n",
   "# in input order, so the results are identical to filtering one feature at a time.\n",
   "awk '\n",
   "BEGIN {\n",
   "    n = split(\"CDS exon gene mRNA\", features, \" \")\n",
   "    for (i = 1; i <= n; i++) {\n",
   "        out[features[i]] = \"Pgenerosa_v070.\" features[i] \".gff\"\n",
   "        print \"##gff-version 3\" > (out[features[i]])\n",
   "    }\n",
   "}\n",
   "$3 in out { print > (out[$3]) }\n",
   "' \"${gff}\"\n",
   "\n",
   "md5sum Pgenerosa_v070.*.gff" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
   "#### Some quick counts of initial genome assembly and separate GFFs.\n",
   "\n",
   "NOTE: Subtract 1 from each GFF count to account for the GFF header line (I was too lazy to do the subtraction programmatically)." ] },
 { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   "GENOME ASSEMBLY CONTIGS:\n",
   "313649\n",
   "-------------------------\n",
   "\n",
   "CODING SEQUENCE COUNTS:\n",
   "169461 Pgenerosa_v070.CDS.gff\n",
   "-------------------------\n",
   "\n",
   "EXON SEQUENCE COUNTS:\n",
   "175007 Pgenerosa_v070.exon.gff\n",
   "-------------------------\n",
   "\n",
   "GENE SEQUENCE COUNTS:\n",
   "53036 Pgenerosa_v070.gene.gff\n",
   "-------------------------\n",
   "\n",
   "mRNA SEQUENCE COUNTS:\n",
   "53036 Pgenerosa_v070.mRNA.gff\n",
   "-------------------------\n",
   "\n" ] } ], "source": [
   "%%bash\n",
   "echo \"GENOME ASSEMBLY CONTIGS:\"\n",
   "grep -c \">\" Pgenerosa_v070.fa\n",
   "echo \"-------------------------\"\n",
   "echo \"\"\n",
   "echo \"CODING SEQUENCE COUNTS:\"\n",
   "wc -l Pgenerosa_v070.CDS.gff\n",
   "echo \"-------------------------\"\n",
   "echo \"\"\n",
   "echo \"EXON SEQUENCE COUNTS:\"\n",
   "wc -l Pgenerosa_v070.exon.gff\n",
   "echo \"-------------------------\"\n",
   "echo \"\"\n",
   "echo \"GENE SEQUENCE COUNTS:\"\n",
   "wc -l Pgenerosa_v070.gene.gff\n",
   "echo \"-------------------------\"\n",
   "echo \"\"\n",
   "echo \"mRNA SEQUENCE COUNTS:\"\n",
   "wc -l Pgenerosa_v070.mRNA.gff\n",
   "echo \"-------------------------\"\n",
   "echo \"\"\n" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
   "#### Transfer new GFFs to canonical data storage location" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   "sending incremental file list\n",
   "Pgenerosa_v070.CDS.gff\n",
   "Pgenerosa_v070.exon.gff\n",
   "Pgenerosa_v070.gene.gff\n",
   "Pgenerosa_v070.mRNA.gff\n",
   "\n",
   "sent 70,886,696 bytes received 95 bytes 10,905,660.15 bytes/sec\n",
   "total size is 70,869,112 speedup is 1.00\n" ] } ], "source": [
   "%%bash\n",
   "rsync -av Pgenerosa_v070.*.gff owl:/volume1/web/halfshell/genomic-databank/" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
   "#### Guess I should glance at the new GFFs to make sure they look alright..."
] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   "##gff-version 3\n",
   "PGA_scaffold19103__1_contigs__length_6100\tmaker\tCDS\t5114\t5297\t.\t-\t0\tID=PGEN_00029322-RA:cds;Parent=PGEN_00029322-RA;\n",
   "PGA_scaffold19103__1_contigs__length_6100\tmaker\tCDS\t3446\t3906\t.\t-\t2\tID=PGEN_00029322-RA:cds;Parent=PGEN_00029322-RA;\n",
   "PGA_scaffold229666__1_contigs__length_7299\tmaker\tCDS\t132\t152\t.\t+\t0\tID=PGEN_00026601-RA:cds;Parent=PGEN_00026601-RA;\n",
   "PGA_scaffold229666__1_contigs__length_7299\tmaker\tCDS\t3415\t3567\t.\t+\t0\tID=PGEN_00026601-RA:cds;Parent=PGEN_00026601-RA;\n",
   "PGA_scaffold229666__1_contigs__length_7299\tmaker\tCDS\t4891\t5004\t.\t+\t0\tID=PGEN_00026601-RA:cds;Parent=PGEN_00026601-RA;\n",
   "PGA_scaffold285130__1_contigs__length_6957\tmaker\tCDS\t4418\t4774\t.\t+\t0\tID=PGEN_00027325-RA:cds;Parent=PGEN_00027325-RA;\n",
   "PGA_scaffold132901__1_contigs__length_1641\tmaker\tCDS\t321\t545\t.\t+\t0\tID=PGEN_00047624-RA:cds;Parent=PGEN_00047624-RA;\n",
   "PGA_scaffold132901__1_contigs__length_1641\tmaker\tCDS\t867\t1397\t.\t+\t0\tID=PGEN_00047624-RA:cds;Parent=PGEN_00047624-RA;\n",
   "PGA_scaffold132901__1_contigs__length_1641\tmaker\tCDS\t912\t1615\t.\t-\t0\tID=PGEN_00047623-RA:cds;Parent=PGEN_00047623-RA;\n" ] } ], "source": [
   "%%bash\n",
   "# Preview the first 10 lines: the GFF3 version pragma followed by the first CDS records.\n",
   "head -n 10 Pgenerosa_v070.CDS.gff" ] }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
  "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } },
 "nbformat": 4, "nbformat_minor": 2 }