{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Wed Oct 30 08:34:26 PDT 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.971\n", "BogoMIPS: 5851.97\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 29G 7.0G 688M 34G 39G\n", "Swap: 4.7G 15M 4.6G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set variables\n", "`%env` variables are good for passing to bash cells" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20191030_pgen_v074.a4_intron_intergenic_features\n", "env: rsync_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "env: genome_fasta=Pgenerosa_v074.fa\n", "env: chrome_sizes=Pgenerosa_v074.sizes.txt\n", "env: gffs=Panopea-generosa-vv0.74.a4.[eg]*.gff\n", "env: exon_gff=Panopea-generosa-vv0.74.a4.exon.gff3\n", "env: exon_gff_sorted=Panopea-generosa-vv0.74.a4.exon.sorted.gff3\n", "env: gene_gff=Panopea-generosa-vv0.74.a4.gene.gff3\n", "env: gene_gff_sorted=Panopea-generosa-vv0.74.a4.gene.sorted.gff3\n", "env: wget_gff=--directory-prefix=$/home/sam/analyses/20191030_pgen_v074.a4_intron_intergenic_features --recursive --quiet --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a4.[eg]*.gff' https://owl.fish.washington.edu/halfshell/genomic-databank/\n", "env: wget_fasta=--no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/\"${genome_fasta}\"\n", "env: exon_comp_bed=Panopea-generosa-vv0.74.a4.exon.sorted.comp.bed\n", "env: intron_bed=Panopea-generosa-vv0.74.a4.introns.bed\n", "env: intergenic_bed=Panopea-generosa-vv0.74.a4.intergenic.bed\n", "env: bedtools_dir=/home/sam/programs/bedtools-2.28.0/bin\n", "env: samtools=/home/sam/programs/samtools-1.9/samtools\n" ] } ], "source": [ "# Set workding directory\n", "%env wd=/home/sam/analyses/20191030_pgen_v074.a4_intron_intergenic_features\n", "wd=\"/home/sam/analyses/20191030_pgen_v074.a4_intron_intergenic_features\"\n", "\n", "\n", "%env rsync_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "%env genome_fasta=Pgenerosa_v074.fa\n", "%env chrome_sizes=Pgenerosa_v074.sizes.txt\n", "%env gffs=Panopea-generosa-vv0.74.a4.[eg]*.gff\n", "%env exon_gff=Panopea-generosa-vv0.74.a4.exon.gff3\n", "%env exon_gff_sorted=Panopea-generosa-vv0.74.a4.exon.sorted.gff3\n", "%env gene_gff=Panopea-generosa-vv0.74.a4.gene.gff3\n", "%env gene_gff_sorted=Panopea-generosa-vv0.74.a4.gene.sorted.gff3\n", "%env wget_gff=--directory-prefix=${wd} --recursive --quiet --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a4.[eg]*.gff' https://owl.fish.washington.edu/halfshell/genomic-databank/\n", "%env wget_fasta=--no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/\"${genome_fasta}\"\n", "%env exon_comp_bed=Panopea-generosa-vv0.74.a4.exon.sorted.comp.bed\n", "%env intron_bed=Panopea-generosa-vv0.74.a4.introns.bed\n", "%env intergenic_bed=Panopea-generosa-vv0.74.a4.intergenic.bed\n", "\n", "# Programs\n", "%env bedtools_dir=/home/sam/programs/bedtools-2.28.0/bin\n", "%env samtools=/home/sam/programs/samtools-1.9/samtools\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create necessary directories" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "mkdir --parents ${wd}" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20191030_pgen_v074.a4_intron_intergenic_features\n" ] } ], "source": [ "cd {wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Download _Panopea generosa_ GFFs for v074.a4.\n", "\n", "Info on GFFs is here: [https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "Pgenerosa_v074.fa\n", "\n", "sent 30 bytes received 958,176,954 bytes 27,773,245.91 bytes/sec\n", "total size is 958,059,901 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.exon.gff3\n", "\n", "sent 30 bytes received 64,671,619 bytes 14,371,477.56 bytes/sec\n", "total size is 64,663,603 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.gene.gff3\n", "\n", "sent 30 bytes received 10,999,145 bytes 4,399,670.00 bytes/sec\n", "total size is 10,997,681 speedup is 1.00\n", "\n", "\n", "----------------------------------------------------------\n", "\n", "total 986M\n", "-rwxr--r-- 1 sam users 62M Oct 14 10:13 Panopea-generosa-vv0.74.a4.exon.gff3\n", "-rwxr--r-- 1 sam users 11M Oct 14 10:13 Panopea-generosa-vv0.74.a4.gene.gff3\n", "-rw-rw-rw- 1 sam users 914M Jun 26 08:49 Pgenerosa_v074.fa\n" ] } ], "source": [ "%%bash\n", "\n", "input_files_array=(\"${genome_fasta}\" \"${exon_gff}\" \"${gene_gff}\")\n", "\n", "for file in \"${input_files_array[@]}\"\n", "do\n", " rsync \\\n", " --archive \\\n", " --verbose \\\n", " \"${rsync_owl}${file}\" \\\n", " .\n", "done\n", "echo \"\"\n", "echo \"\"\n", "echo \"----------------------------------------------------------\"\n", "echo \"\"\n", "\n", "ls -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### If need to download via wget, uncomment lines in the cell below" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# %%bash\n", "# time \\\n", "# wget \"${wget_gffs}\"\n", "# wget \"${wget_fasta}\"\n", "# ls -lh ${wd}" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Panopea-generosa-vv0.74.a4.exon.gff3\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\texon\t2\t125\t.\t+\t.\tID=PGEN_.00g000010.m01.exon01;Name=PGEN_.00g000010.m01.exon01;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon1;Alias=21510-PGEN_.00g234140.m01.exon1\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\texon\t1995\t2095\t.\t+\t.\tID=PGEN_.00g000010.m01.exon02;Name=PGEN_.00g000010.m01.exon02;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon2;Alias=21510-PGEN_.00g234140.m01.exon2\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\texon\t3325\t3495\t.\t+\t.\tID=PGEN_.00g000010.m01.exon03;Name=PGEN_.00g000010.m01.exon03;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon3;Alias=21510-PGEN_.00g234140.m01.exon3\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\texon\t4651\t4719\t.\t+\t.\tID=PGEN_.00g000010.m01.exon04;Name=PGEN_.00g000010.m01.exon04;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon4;Alias=21510-PGEN_.00g234140.m01.exon4\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\texon\t19808\t19943\t.\t-\t.\tID=PGEN_.00g000020.m01.exon01;Name=PGEN_.00g000020.m01.exon01;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon10;Alias=21510-PGEN_.00g234150.m01.exon10\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\texon\t21133\t21362\t.\t-\t.\tID=PGEN_.00g000020.m01.exon02;Name=PGEN_.00g000020.m01.exon02;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon9;Alias=21510-PGEN_.00g234150.m01.exon9\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\texon\t22487\t22613\t.\t-\t.\tID=PGEN_.00g000020.m01.exon03;Name=PGEN_.00g000020.m01.exon03;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon8;Alias=21510-PGEN_.00g234150.m01.exon8\n", "\n", "----------------------\n", "\n", "Panopea-generosa-vv0.74.a4.gene.gff3\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\tgene\t2\t4719\t.\t+\t.\tID=PGEN_.00g000010;Name=PGEN_.00g000010;original_ID=21510-PGEN_.00g234140;Alias=21510-PGEN_.00g234140;original_name=21510-PGEN_.00g234140;Notes=sp|Q86IC9|CAMT1_DICDI [BLAST protein vs protein (blastp) 2.7.1],PF01596.12 [Pfam 1.6]\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\tgene\t19808\t36739\t.\t-\t.\tID=PGEN_.00g000020;Name=PGEN_.00g000020;original_ID=21510-PGEN_.00g234150;Alias=21510-PGEN_.00g234150;original_name=21510-PGEN_.00g234150;Notes=sp|P04177|TY3H_RAT [BLAST protein vs protein (blastp) 2.7.1],sp|P04177|TY3H_RAT [DIAMOND Functional 0.9.22],IPR036951 [InterProScan 5.29-68.0],PF00351.16 [Pfam 1.6]\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\tgene\t49248\t52578\t.\t-\t.\tID=PGEN_.00g000030;Name=PGEN_.00g000030;original_ID=21510-PGEN_.00g234160;Alias=21510-PGEN_.00g234160;original_name=21510-PGEN_.00g234160;Notes=PF08054.6 [Pfam 1.6]\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\tgene\t55792\t67546\t.\t+\t.\tID=PGEN_.00g000040;Name=PGEN_.00g000040;original_ID=21510-PGEN_.00g234170;Alias=21510-PGEN_.00g234170;original_name=21510-PGEN_.00g234170\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\tgene\t67586\t69113\t.\t-\t.\tID=PGEN_.00g000050;Name=PGEN_.00g000050;original_ID=21510-PGEN_.00g234180;Alias=21510-PGEN_.00g234180;original_name=21510-PGEN_.00g234180;Notes=sp|Q8L840|RQL4A_ARATH [BLAST protein vs protein (blastp) 2.7.1],sp|Q8L840|RQL4A_ARATH [DIAMOND Functional 0.9.22],PF00270.24 [Pfam 1.6]\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\tgene\t70713\t81099\t.\t+\t.\tID=PGEN_.00g000060;Name=PGEN_.00g000060;original_ID=21510-PGEN_.00g234190;Alias=21510-PGEN_.00g234190;original_name=21510-PGEN_.00g234190;Notes=sp|Q61043|NIN_MOUSE [DIAMOND Functional 0.9.22],PF04443.7 [Pfam 1.6]\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d9637f372b5d-publish\tgene\t183686\t186073\t.\t+\t.\tID=PGEN_.00g000070;Name=PGEN_.00g000070;original_ID=21510-PGEN_.00g234200;Alias=21510-PGEN_.00g234200;original_name=21510-PGEN_.00g234200;Notes=PF15364.1 [Pfam 1.6]\n" ] } ], "source": [ "%%bash\n", "echo ${exon_gff}\n", "head ${exon_gff}\n", "echo \"\"\n", "echo \"----------------------\"\n", "echo \"\"\n", "echo ${gene_gff}\n", "head ${gene_gff}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sort files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create FastA index file" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PGA_scaffold1__77_contigs__length_89643857\t89643857\t44\t60\t61\n", "PGA_scaffold2__36_contigs__length_69596280\t69596280\t91138010\t60\t61\n", "PGA_scaffold3__111_contigs__length_57743597\t57743597\t161894273\t60\t61\n", "PGA_scaffold4__129_contigs__length_65288255\t65288255\t220600309\t60\t61\n", "PGA_scaffold5__109_contigs__length_67248332\t67248332\t286976747\t60\t61\n", "PGA_scaffold6__104_contigs__length_61759565\t61759565\t355345930\t60\t61\n", "PGA_scaffold7__69_contigs__length_43120122\t43120122\t418134866\t60\t61\n", "PGA_scaffold8__63_contigs__length_61151155\t61151155\t461973701\t60\t61\n", "PGA_scaffold9__45_contigs__length_38581958\t38581958\t524144086\t60\t61\n", "PGA_scaffold10__49_contigs__length_53961475\t53961475\t563369122\t60\t61\n", "PGA_scaffold11__79_contigs__length_51449921\t51449921\t618230000\t60\t61\n", "PGA_scaffold12__71_contigs__length_50438331\t50438331\t670537465\t60\t61\n", "PGA_scaffold13__52_contigs__length_44396874\t44396874\t721816480\t60\t61\n", "PGA_scaffold14__91_contigs__length_45393038\t45393038\t766953347\t60\t61\n", "PGA_scaffold15__101_contigs__length_47938513\t47938513\t813102982\t60\t61\n", "PGA_scaffold16__33_contigs__length_31980953\t31980953\t861840516\t60\t61\n", "PGA_scaffold17__51_contigs__length_34923512\t34923512\t894354530\t60\t61\n", "PGA_scaffold18__69_contigs__length_27737463\t27737463\t929860146\t60\t61\n" ] } ], "source": [ "%%bash\n", "${samtools} faidx \\\n", "${genome_fasta}\n", "\n", "cat ${genome_fasta}.fai" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create chromosome sizes file" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PGA_scaffold10__49_contigs__length_53961475\t53961475\n", "PGA_scaffold11__79_contigs__length_51449921\t51449921\n", "PGA_scaffold12__71_contigs__length_50438331\t50438331\n", "PGA_scaffold13__52_contigs__length_44396874\t44396874\n", "PGA_scaffold14__91_contigs__length_45393038\t45393038\n", "PGA_scaffold15__101_contigs__length_47938513\t47938513\n", "PGA_scaffold16__33_contigs__length_31980953\t31980953\n", "PGA_scaffold17__51_contigs__length_34923512\t34923512\n", "PGA_scaffold18__69_contigs__length_27737463\t27737463\n", "PGA_scaffold1__77_contigs__length_89643857\t89643857\n", "PGA_scaffold2__36_contigs__length_69596280\t69596280\n", "PGA_scaffold3__111_contigs__length_57743597\t57743597\n", "PGA_scaffold4__129_contigs__length_65288255\t65288255\n", "PGA_scaffold5__109_contigs__length_67248332\t67248332\n", "PGA_scaffold6__104_contigs__length_61759565\t61759565\n", "PGA_scaffold7__69_contigs__length_43120122\t43120122\n", "PGA_scaffold8__63_contigs__length_61151155\t61151155\n", "PGA_scaffold9__45_contigs__length_38581958\t38581958\n" ] } ], "source": [ "%%bash\n", "# Change LC_COLLATE to use C for proper sorting of file\n", "export LC_COLLATE=C\n", "\n", "cut -f1,2 ${genome_fasta}.fai | sort > ${chrome_sizes}\n", "\n", "cat ${chrome_sizes}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sort GFFs" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Previewing Panopea-generosa-vv0.74.a4.exon.sorted.gff3:\n", "\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\texon\t2\t1145\t.\t+\t.\tID=PGEN_.00g204300.m01.exon01;Name=PGEN_.00g204300.m01.exon01;Parent=PGEN_.00g204300.m01;original_ID=21510-PGEN_.00g311420.m01.exon1;Alias=21510-PGEN_.00g311420.m01.exon1\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\texon\t19540\t19683\t.\t-\t.\tID=PGEN_.00g204310.m01.exon01;Name=PGEN_.00g204310.m01.exon01;Parent=PGEN_.00g204310.m01;original_ID=21510-PGEN_.00g311430.m01.exon9;Alias=21510-PGEN_.00g311430.m01.exon9\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\texon\t20883\t21134\t.\t-\t.\tID=PGEN_.00g204310.m01.exon02;Name=PGEN_.00g204310.m01.exon02;Parent=PGEN_.00g204310.m01;original_ID=21510-PGEN_.00g311430.m01.exon8;Alias=21510-PGEN_.00g311430.m01.exon8\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\texon\t21900\t21978\t.\t-\t.\tID=PGEN_.00g204310.m01.exon03;Name=PGEN_.00g204310.m01.exon03;Parent=PGEN_.00g204310.m01;original_ID=21510-PGEN_.00g311430.m01.exon7;Alias=21510-PGEN_.00g311430.m01.exon7\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\texon\t23163\t23251\t.\t-\t.\tID=PGEN_.00g204310.m01.exon04;Name=PGEN_.00g204310.m01.exon04;Parent=PGEN_.00g204310.m01;original_ID=21510-PGEN_.00g311430.m01.exon6;Alias=21510-PGEN_.00g311430.m01.exon6\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\texon\t24166\t24255\t.\t-\t.\tID=PGEN_.00g204310.m01.exon05;Name=PGEN_.00g204310.m01.exon05;Parent=PGEN_.00g204310.m01;original_ID=21510-PGEN_.00g311430.m01.exon5;Alias=21510-PGEN_.00g311430.m01.exon5\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\texon\t26335\t26460\t.\t-\t.\tID=PGEN_.00g204310.m01.exon06;Name=PGEN_.00g204310.m01.exon06;Parent=PGEN_.00g204310.m01;original_ID=21510-PGEN_.00g311430.m01.exon4;Alias=21510-PGEN_.00g311430.m01.exon4\n", "\n", "Confirming sort order of Panopea-generosa-vv0.74.a4.exon.sorted.gff3:\n", "\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold10__49_contigs__length_53961475\n", "PGA_scaffold11__79_contigs__length_51449921\n", "PGA_scaffold12__71_contigs__length_50438331\n", "PGA_scaffold13__52_contigs__length_44396874\n", "PGA_scaffold14__91_contigs__length_45393038\n", "PGA_scaffold15__101_contigs__length_47938513\n", "PGA_scaffold16__33_contigs__length_31980953\n", "PGA_scaffold17__51_contigs__length_34923512\n", "PGA_scaffold18__69_contigs__length_27737463\n", "PGA_scaffold1__77_contigs__length_89643857\n", "PGA_scaffold2__36_contigs__length_69596280\n", "PGA_scaffold3__111_contigs__length_57743597\n", "PGA_scaffold4__129_contigs__length_65288255\n", "PGA_scaffold5__109_contigs__length_67248332\n", "PGA_scaffold6__104_contigs__length_61759565\n", "PGA_scaffold7__69_contigs__length_43120122\n", "PGA_scaffold8__63_contigs__length_61151155\n", "PGA_scaffold9__45_contigs__length_38581958\n", "\n", "Previewing Panopea-generosa-vv0.74.a4.gene.sorted.gff3:\n", "\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\tgene\t2\t1145\t.\t+\t.\tID=PGEN_.00g204300;Name=PGEN_.00g204300;original_ID=21510-PGEN_.00g311420;Alias=21510-PGEN_.00g311420;original_name=21510-PGEN_.00g311420\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\tgene\t19540\t36845\t.\t-\t.\tID=PGEN_.00g204310;Name=PGEN_.00g204310;original_ID=21510-PGEN_.00g311430;Alias=21510-PGEN_.00g311430;original_name=21510-PGEN_.00g311430;Notes=sp|F1MNN4|FBXW7_BOVIN [BLAST protein vs protein (blastp) 2.7.1],sp|A1DHW6|SCONB_NEOFI [DIAMOND Functional 0.9.22],IPR001680 [InterProScan 5.29-68.0],PF04041.8 [Pfam 1.6]\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\tgene\t36984\t37535\t.\t-\t.\tID=PGEN_.00g204320;Name=PGEN_.00g204320;original_ID=21510-PGEN_.00g311440;Alias=21510-PGEN_.00g311440;original_name=21510-PGEN_.00g311440;Notes=IPR036322 [InterProScan 5.29-68.0],PF11163.3 [Pfam 1.6]\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\tgene\t41157\t41960\t.\t-\t.\tID=PGEN_.00g204330;Name=PGEN_.00g204330;original_ID=21510-PGEN_.00g311450;Alias=21510-PGEN_.00g311450;original_name=21510-PGEN_.00g311450\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\tgene\t61914\t82099\t.\t+\t.\tID=PGEN_.00g204340;Name=PGEN_.00g204340;original_ID=21510-PGEN_.00g311460;Alias=21510-PGEN_.00g311460;original_name=21510-PGEN_.00g311460;Notes=sp|Q3KRG3|TSR2_DANRE [BLAST protein vs protein (blastp) 2.7.1],sp|Q3KRG3|TSR2_DANRE [DIAMOND Functional 0.9.22],IPR019398 [InterProScan 5.29-68.0],PF10273.4 [Pfam 1.6]\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\tgene\t90961\t109412\t.\t-\t.\tID=PGEN_.00g204350;Name=PGEN_.00g204350;original_ID=21510-PGEN_.00g311470;Alias=21510-PGEN_.00g311470;original_name=21510-PGEN_.00g311470;Notes=PF07495.8 [Pfam 1.6]\n", "PGA_scaffold10__49_contigs__length_53961475\tGenSAS_5d9637f372b5d-publish\tgene\t135015\t215664\t.\t-\t.\tID=PGEN_.00g204360;Name=PGEN_.00g204360;original_ID=21510-PGEN_.00g311480;Alias=21510-PGEN_.00g311480;original_name=21510-PGEN_.00g311480;Notes=sp|Q7TMY8|HUWE1_MOUSE [BLAST protein vs protein (blastp) 2.7.1],sp|Q7Z6Z7|HUWE1_HUMAN [DIAMOND Functional 0.9.22],PF06012.7 [Pfam 1.6]\n", "\n", "Confirming sort order of Panopea-generosa-vv0.74.a4.gene.sorted.gff3:\n", "\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold10__49_contigs__length_53961475\n", "PGA_scaffold11__79_contigs__length_51449921\n", "PGA_scaffold12__71_contigs__length_50438331\n", "PGA_scaffold13__52_contigs__length_44396874\n", "PGA_scaffold14__91_contigs__length_45393038\n", "PGA_scaffold15__101_contigs__length_47938513\n", "PGA_scaffold16__33_contigs__length_31980953\n", "PGA_scaffold17__51_contigs__length_34923512\n", "PGA_scaffold18__69_contigs__length_27737463\n", "PGA_scaffold1__77_contigs__length_89643857\n", "PGA_scaffold2__36_contigs__length_69596280\n", "PGA_scaffold3__111_contigs__length_57743597\n", "PGA_scaffold4__129_contigs__length_65288255\n", "PGA_scaffold5__109_contigs__length_67248332\n", "PGA_scaffold6__104_contigs__length_61759565\n", "PGA_scaffold7__69_contigs__length_43120122\n", "PGA_scaffold8__63_contigs__length_61151155\n", "PGA_scaffold9__45_contigs__length_38581958\n", "\n" ] } ], "source": [ "%%bash\n", "gff_array=(\"${exon_gff}\" \"${gene_gff}\")\n", "sorted_gff_array=(\"${exon_gff_sorted}\" \"${gene_gff_sorted}\")\n", "\n", "for index in \"${!gff_array[@]}\"\n", "do\n", " { awk 'NR<4 {print}' \"${gff_array[index]}\"\n", " awk 'NR>3 {print}' \"${gff_array[index]}\" | \"${bedtools_dir}\"/bedtools sort -i -\n", " } >> \"${sorted_gff_array[index]}\"\n", " \n", " # Check out sorted GFFs\n", " echo \"Previewing ${sorted_gff_array[index]}:\"\n", " echo \"\"\n", " head \"${sorted_gff_array[index]}\"\n", " echo \"\"\n", " echo \"Confirming sort order of ${sorted_gff_array[index]}:\"\n", " echo \"\"\n", " cut -f1 \"${sorted_gff_array[index]}\" | uniq\n", " echo \"\"\n", "done" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create intergenic BED file" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Previewing Panopea-generosa-vv0.74.a4.intergenic.bed:\n", "\n", "PGA_scaffold10__49_contigs__length_53961475\t0\t1\n", "PGA_scaffold10__49_contigs__length_53961475\t1145\t19539\n", "PGA_scaffold10__49_contigs__length_53961475\t36845\t36983\n", "PGA_scaffold10__49_contigs__length_53961475\t37535\t41156\n", "PGA_scaffold10__49_contigs__length_53961475\t41960\t61913\n", "PGA_scaffold10__49_contigs__length_53961475\t82099\t90960\n", "PGA_scaffold10__49_contigs__length_53961475\t109412\t135014\n", "PGA_scaffold10__49_contigs__length_53961475\t215664\t218086\n", "PGA_scaffold10__49_contigs__length_53961475\t219013\t225102\n", "PGA_scaffold10__49_contigs__length_53961475\t230053\t232785\n" ] } ], "source": [ "%%bash\n", "\n", "\"${bedtools_dir}\"/complementBed \\\n", "-i \"${gene_gff_sorted}\" \\\n", "-g \"${chrome_sizes}\" \\\n", "> \"${intergenic_bed}\"\n", "\n", "echo \"Previewing ${intergenic_bed}:\"\n", "echo \"\"\n", "head \"${intergenic_bed}\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Make intron BED file" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Previewing Panopea-generosa-vv0.74.a4.exon.sorted.comp.bed:\n", "\n", "PGA_scaffold10__49_contigs__length_53961475\t0\t1\n", "PGA_scaffold10__49_contigs__length_53961475\t1145\t19539\n", "PGA_scaffold10__49_contigs__length_53961475\t19683\t20882\n", "PGA_scaffold10__49_contigs__length_53961475\t21134\t21899\n", "PGA_scaffold10__49_contigs__length_53961475\t21978\t23162\n", "PGA_scaffold10__49_contigs__length_53961475\t23251\t24165\n", "PGA_scaffold10__49_contigs__length_53961475\t24255\t26334\n", "PGA_scaffold10__49_contigs__length_53961475\t26460\t28467\n", "PGA_scaffold10__49_contigs__length_53961475\t28623\t35328\n", "PGA_scaffold10__49_contigs__length_53961475\t35935\t36093\n", "\n", "---------------------\n", "\n", "Previewing Panopea-generosa-vv0.74.a4.introns.bed:\n", "\n", "PGA_scaffold10__49_contigs__length_53961475\t19684\t20882\n", "PGA_scaffold10__49_contigs__length_53961475\t21135\t21899\n", "PGA_scaffold10__49_contigs__length_53961475\t21979\t23162\n", "PGA_scaffold10__49_contigs__length_53961475\t23252\t24165\n", "PGA_scaffold10__49_contigs__length_53961475\t24256\t26334\n", "PGA_scaffold10__49_contigs__length_53961475\t26461\t28467\n", "PGA_scaffold10__49_contigs__length_53961475\t35936\t36093\n", "PGA_scaffold10__49_contigs__length_53961475\t28624\t35328\n", "PGA_scaffold10__49_contigs__length_53961475\t61983\t65309\n", "PGA_scaffold10__49_contigs__length_53961475\t65860\t81002\n" ] } ], "source": [ "%%bash\n", "\"${bedtools_dir}\"/complementBed \\\n", "-i \"${exon_gff_sorted}\" \\\n", "-g \"${chrome_sizes}\" \\\n", "> \"${exon_comp_bed}\"\n", "\n", "echo \"Previewing ${exon_comp_bed}:\"\n", "echo \"\"\n", "head \"${exon_comp_bed}\"\n", "echo \"\"\n", "echo \"---------------------\"\n", "\n", "\"${bedtools_dir}\"/intersectBed \\\n", "-a \"${gene_gff_sorted}\" \\\n", "-b \"${exon_comp_bed}\" \\\n", "| awk -v OFS='\\t' '{print $1,$4,$5}' \\\n", "> \"${intron_bed}\"\n", "\n", "echo \"\"\n", "echo \"Previewing ${intron_bed}:\"\n", "echo \"\"\n", "head \"${intron_bed}\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cleanup" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 12M\n", "-rw-rw-r-- 1 sam sam 2.1M Oct 30 08:41 Panopea-generosa-vv0.74.a4.intergenic.bed\n", "-rw-rw-r-- 1 sam sam 9.1M Oct 30 08:41 Panopea-generosa-vv0.74.a4.introns.bed\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "rm: cannot remove 'Pgenerosa_v074.fa*': No such file or directory\n", "rm: cannot remove 'Panopea-generosa-vv0.74.a4.exon.sorted.comp.bed': No such file or directory\n" ] } ], "source": [ "%%bash\n", "rm \"${genome_fasta}\"* \"${exon_comp_bed}\" *.gff3 *.txt\n", "ls -ltrh" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }