{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Tue Dec 17 07:00:28 PST 2019\n", "------------\n", "\n", "LSB Version:\tcore-9.20170808ubuntu1-noarch:security-9.20170808ubuntu1-noarch\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 18.04.3 LTS\n", "Release:\t18.04\n", "Codename:\tbionic\n", "\n", "------------\n", "HOSTNAME: \n", "computer\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 4\n", "On-line CPU(s) list: 0-3\n", "Thread(s) per core: 2\n", "Core(s) per socket: 2\n", "Socket(s): 1\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 58\n", "Model name: Intel(R) Core(TM) i7-3517U CPU @ 1.90GHz\n", "Stepping: 9\n", "CPU MHz: 1795.794\n", "CPU max MHz: 3000.0000\n", "CPU min MHz: 800.0000\n", "BogoMIPS: 4788.78\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 4096K\n", "NUMA node0 CPU(s): 0-3\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm cpuid_fault epb pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt dtherm ida arat pln pts md_clear flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 7.5G 1.8G 320M 628M 5.4G 4.8G\n", "Swap: 7.7G 12K 7.7G\n" ] } ], "source": [ "%%bash\n", "# Record the date, operating system, hostname, CPU and memory of the machine running this notebook\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer Specs:\"\n", "echo \"\"\n", "# CPU details\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "# Memory and swap usage in human-readable units\n", "free -mh" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Set variables\n", "`%env` exports each value as an environment variable so that the `%%bash` cells below can read it." ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/samb/analyses/20191214_olur_genome_feature_and_intron_splitting\n", "env: downloads_list=Olurida_v081.fa.fai Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff Olurida_v081-20190709.exon.gff Olurida_v081-20190709.gene.gff\n", "env: rsync_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "env: chrome_sizes=Olurida_v081.sizes.txt\n", "env: fa_index=Olurida_v081.fa.fai\n", "env: maker_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n", "env: base_name=Olurida_v081-20190709\n", "env: exon_gff=Olurida_v081-20190709.exon.gff\n", "env: exon_gff_sorted=Olurida_v081-20190709.exon.sorted.gff3\n", "env: gene_gff=Olurida_v081-20190709.gene.gff\n", "env: gene_gff_sorted=Olurida_v081-20190709.gene.sorted.gff3\n", "env: exon_comp_bed=Olurida_v081-20190709.exon.sorted.comp.bed\n", "env: intron_bed=Olurida_v081-20190709.introns.bed\n", "env: intergenic_bed=Olurida_v081-20190709.intergenic.bed\n", "env: bedtools_dir=/home/samb/programs/bedtools-2.29.0/bin\n", "env: samtools=/home/samb/programs/samtools-1.10\n" ] } ], "source": [ "# Working directory: defined once as a Python variable (used by the %cd cell below)\n", "# and exported unchanged to the environment (used by the %%bash cells).\n", "wd = \"/home/samb/analyses/20191214_olur_genome_feature_and_intron_splitting\"\n", "%env wd=$wd\n", "\n", "\n", "# File download\n", "%env downloads_list=Olurida_v081.fa.fai Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff Olurida_v081-20190709.exon.gff Olurida_v081-20190709.gene.gff\n", "%env rsync_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "\n", "# Input/output files\n", "%env chrome_sizes=Olurida_v081.sizes.txt\n", "%env fa_index=Olurida_v081.fa.fai\n", "%env maker_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n", "%env base_name=Olurida_v081-20190709\n", "%env exon_gff=Olurida_v081-20190709.exon.gff\n", "%env exon_gff_sorted=Olurida_v081-20190709.exon.sorted.gff3\n", "%env gene_gff=Olurida_v081-20190709.gene.gff\n", "%env gene_gff_sorted=Olurida_v081-20190709.gene.sorted.gff3\n", "\n", "%env exon_comp_bed=Olurida_v081-20190709.exon.sorted.comp.bed\n", "%env intron_bed=Olurida_v081-20190709.introns.bed\n", "%env intergenic_bed=Olurida_v081-20190709.intergenic.bed\n", "\n", "# Programs\n", "%env bedtools_dir=/home/samb/programs/bedtools-2.29.0/bin\n", "%env samtools=/home/samb/programs/samtools-1.10" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Create necessary directories" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "# Create the working directory and any missing parent directories\n", "mkdir --parents \"${wd}\"" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/samb/analyses/20191214_olur_genome_feature_and_intron_splitting\n" ] } ], "source": [ "# Change the notebook's working directory; later cells refer to files by relative name\n", "%cd {wd}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Download files" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "\n", "sent 11 bytes received 63 bytes 11.38 bytes/sec\n", "total size is 6,103,215 speedup is 82,475.88\n", "receiving incremental file list\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n", "\n", "sent 30 bytes received 3,105,037,889 bytes 3,625,263.19 bytes/sec\n", "total size is 3,104,658,743 speedup is 1.00\n", "receiving incremental 
file list\n", "\n", "sent 11 bytes received 75 bytes 13.23 bytes/sec\n", "total size is 9,248,086 speedup is 107,535.88\n", "\n", "\n", "----------------------------------------------------------\n", "\n", "total 3.0G\n", "-rw-r--r-- 1 samb samb 16M Jul 16 07:10 Olurida_v081-20190709.exon.gff\n", "-rw-r--r-- 1 samb samb 8.9M Jul 16 07:10 Olurida_v081-20190709.gene.gff\n", "-rw-rw-rw- 1 samb samb 5.9M Aug 24 2018 Olurida_v081.fa.fai\n", "-rw-rw-r-- 1 samb samb 2.9G Dec 13 21:30 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "X11 forwarding request failed on channel 0\n", "X11 forwarding request failed on channel 0\n", "Connection closed by 128.95.149.83 port 22\n", "rsync: connection unexpectedly closed (0 bytes received so far) [Receiver]\n", "rsync error: unexplained error (code 255) at io.c(235) [Receiver=3.1.2]\n", "X11 forwarding request failed on channel 0\n" ] } ], "source": [ "%%bash\n", "# Download every file named in ${downloads_list} from ${rsync_owl} into the\n", "# current directory, then list the directory contents.\n", "\n", "# Split the space-separated list into an array\n", "read -r -a downloads_array <<< \"${downloads_list}\"\n", "\n", "# Files whose transfer failed and that are not already present locally\n", "missing_files=()\n", "\n", "for file in \"${downloads_array[@]}\"\n", "do\n", "    if ! rsync \\\n", "        --archive \\\n", "        --verbose \\\n", "        \"${rsync_owl}${file}\" \\\n", "        .\n", "    then\n", "        # Report a failed transfer instead of silently continuing\n", "        echo \"WARNING: rsync failed for ${file}\" >&2\n", "        [[ -s \"${file}\" ]] || missing_files+=(\"${file}\")\n", "    fi\n", "## Uncomment lines below if need to use wget for downloads\n", "# wget --quiet \\\n", "# --no-check-certificate \\\n", "# https://owl.fish.washington.edu/halfshell/genomic-databank/${file}\n", "done\n", "echo \"\"\n", "echo \"\"\n", "echo \"----------------------------------------------------------\"\n", "echo \"\"\n", "\n", "\n", "\n", "ls -lh\n", "\n", "# Fail the cell when an input file is unavailable so the problem is not missed\n", "if (( ${#missing_files[@]} > 0 ))\n", "then\n", "    echo \"ERROR: missing input files: ${missing_files[*]}\" >&2\n", "    exit 1\n", "fi" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Separate features" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Here are the features and their counts in Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff:\n", "19565118 \n", " 163637 
CDS\n", " 159429 contig\n", " 170394 exon\n", "1214512 expressed_sequence_match\n", " 14903 five_prime_UTR\n", " 32210 gene\n", "2099176 match\n", "7027538 match_part\n", " 32210 mRNA\n", " 332676 protein_match\n", " 13838 three_prime_UTR\n", "\n", "\n", "----------------------------------------------\n", "\n", "\n", "Olurida_v081-20190709.five_prime_UTR.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "Contig58217\tmaker\tfive_prime_UTR\t11504\t11541\t.\t-\t.\tID=OLUR_00019127-RA:five_prime_utr;Parent=OLUR_00019127-RA;\n", "Contig9540\tmaker\tfive_prime_UTR\t10078\t10179\t.\t-\t.\tID=OLUR_00018391-RA:five_prime_utr;Parent=OLUR_00018391-RA;\n", "Contig36645\tmaker\tfive_prime_UTR\t2185\t2225\t.\t+\t.\tID=OLUR_00022996-RA:five_prime_utr;Parent=OLUR_00022996-RA;\n", "Contig3008\tmaker\tfive_prime_UTR\t5726\t6482\t.\t-\t.\tID=OLUR_00018754-RA:five_prime_utr;Parent=OLUR_00018754-RA;\n", "\n", "\n", "Olurida_v081-20190709.three_prime_UTR.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "Contig1111\tmaker\tthree_prime_UTR\t24968\t25427\t.\t-\t.\tID=OLUR_00006628-RA:three_prime_utr;Parent=OLUR_00006628-RA;\n", "Contig2046\tmaker\tthree_prime_UTR\t17809\t17955\t.\t+\t.\tID=OLUR_00011450-RA:three_prime_utr;Parent=OLUR_00011450-RA;\n", "Contig2046\tmaker\tthree_prime_UTR\t18030\t18113\t.\t+\t.\tID=OLUR_00011450-RA:three_prime_utr;Parent=OLUR_00011450-RA;\n", "Contig2046\tmaker\tthree_prime_UTR\t18333\t18394\t.\t+\t.\tID=OLUR_00011450-RA:three_prime_utr;Parent=OLUR_00011450-RA;\n" ] } ], "source": [ "%%bash\n", "# Report the feature types present in the MAKER GFF, then write one GFF3 per UTR type.\n", "\n", "# Create UTR array\n", "utr_array=(five_prime_UTR three_prime_UTR)\n", "\n", "echo \"Here are the features and their counts in ${maker_gff}:\"\n", "# Column 3 holds the feature type; NR>1 skips the first (header) line\n", "awk 'NR>1 {print $3}' \"${maker_gff}\" | sort | uniq -c\n", "echo \"\"\n", "echo \"\"\n", "echo \"----------------------------------------------\"\n", "\n", "\n", "# Create GFFs\n", "for feature in \"${utr_array[@]}\"\n", "do\n", "    output=\"${base_name}.${feature}.gff3\"\n", "    # Write with '>' (truncate) rather than '>>' so that re-running the cell\n", "    # rebuilds the file instead of appending a second copy to it.\n", "    {\n", "        # Header line first, then every record of this feature type\n", "        head -n 1 \"${maker_gff}\"\n", "        awk -v feature=\"$feature\" '$3 == feature {print}' \"${maker_gff}\"\n", "    } > \"${output}\"\n", "    echo \"\"\n", "    echo \"\"\n", "    echo \"${output}\"\n", "    echo \"----------------------------------------------\"\n", "    head -n 5 \"${output}\"\n", "done" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Check exon and gene GFFs" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Olurida_v081-20190709.exon.gff\n", "##gff-version 3\n", "Contig61093\tmaker\texon\t7493\t7773\t.\t+\t.\tID=OLUR_00020575-RA:exon:1553;Parent=OLUR_00020575-RA;\n", "Contig61093\tmaker\texon\t7814\t7946\t.\t+\t.\tID=OLUR_00020575-RA:exon:1554;Parent=OLUR_00020575-RA;\n", "Contig1111\tmaker\texon\t24968\t25494\t.\t-\t.\tID=OLUR_00006628-RA:exon:209;Parent=OLUR_00006628-RA;\n", "Contig1111\tmaker\texon\t26015\t26277\t.\t-\t.\tID=OLUR_00006628-RA:exon:208;Parent=OLUR_00006628-RA;\n", "Contig1111\tmaker\texon\t26889\t27080\t.\t-\t.\tID=OLUR_00006628-RA:exon:207;Parent=OLUR_00006628-RA;\n", "Contig1111\tmaker\texon\t27651\t27803\t.\t-\t.\tID=OLUR_00006628-RA:exon:206;Parent=OLUR_00006628-RA;\n", "Contig1111\tmaker\texon\t28523\t28696\t.\t-\t.\tID=OLUR_00006628-RA:exon:205;Parent=OLUR_00006628-RA;\n", "Contig214118\tmaker\texon\t201\t290\t.\t+\t.\tID=OLUR_00032161-RA:exon:2219;Parent=OLUR_00032161-RA;\n", "Contig214118\tmaker\texon\t807\t926\t.\t+\t.\tID=OLUR_00032161-RA:exon:2220;Parent=OLUR_00032161-RA;\n", "\n", "----------------------\n", "\n", "Olurida_v081-20190709.gene.gff\n", "##gff-version 3\n", "Contig61093\tmaker\tgene\t7493\t7946\t.\t+\t.\tID=OLUR_00020575;Name=OLUR_00020575;Alias=maker-Contig61093-snap-gene-0.2;Note=Protein of unknown function;\n", "Contig1111\tmaker\tgene\t24968\t28696\t.\t-\t.\tID=OLUR_00006628;Name=OLUR_00006628;Alias=maker-Contig1111-snap-gene-0.1;Note=Similar to Spag6: 
Sperm-associated antigen 6 (Mus musculus OX%3D10090);Dbxref=Gene3D:G3DSA:1.25.10.10,InterPro:IPR000225,InterPro:IPR000357,InterPro:IPR011989,InterPro:IPR016024,Pfam:PF02985,SMART:SM00185,SUPERFAMILY:SSF48371;Ontology_term=GO:0005515;\n", "Contig214118\tmaker\tgene\t201\t926\t.\t+\t.\tID=OLUR_00032161;Name=OLUR_00032161;Alias=maker-Contig214118-snap-gene-0.0;Note=Protein of unknown function;Dbxref=Gene3D:G3DSA:3.10.450.10;\n", "Contig58217\tmaker\tgene\t9736\t11541\t.\t-\t.\tID=OLUR_00019127;Name=OLUR_00019127;Alias=snap_masked-Contig58217-processed-gene-0.2;Note=Protein of unknown function;Dbxref=MobiDBLite:mobidb-lite;\n", "Contig2046\tmaker\tgene\t2295\t18394\t.\t+\t.\tID=OLUR_00011450;Name=OLUR_00011450;Alias=maker-Contig2046-snap-gene-0.5;Note=Protein of unknown function;\n", "Contig9540\tmaker\tgene\t4303\t10179\t.\t-\t.\tID=OLUR_00018391;Name=OLUR_00018391;Alias=maker-Contig9540-snap-gene-0.1;Note=Protein of unknown function;Dbxref=Gene3D:G3DSA:2.170.300.10;\n", "Contig52254\tmaker\tgene\t8908\t11733\t.\t+\t.\tID=OLUR_00011614;Name=OLUR_00011614;Alias=maker-Contig52254-snap-gene-0.1;Note=Similar to MM_0045: Putative ankyrin repeat protein MM_0045 (Methanosarcina mazei (strain ATCC BAA-159 / DSM 3647 / Goe1 / Go1 / JCM 11833 / OCM 88) OX%3D192952);Dbxref=CDD:cd00204,Gene3D:G3DSA:1.25.40.20,InterPro:IPR002110,InterPro:IPR020683,InterPro:IPR036770,PRINTS:PR01415,Pfam:PF12796,Pfam:PF13606,Pfam:PF13857,ProSiteProfiles:PS50088,ProSiteProfiles:PS50297,SMART:SM00248,SUPERFAMILY:SSF48403;Ontology_term=GO:0005515;\n", "Contig36645\tmaker\tgene\t2185\t3340\t.\t+\t.\tID=OLUR_00022996;Name=OLUR_00022996;Alias=maker-Contig36645-snap-gene-0.1;Note=Protein of unknown function;Dbxref=MobiDBLite:mobidb-lite;\n", "Contig3008\tmaker\tgene\t532\t6482\t.\t-\t.\tID=OLUR_00018754;Name=OLUR_00018754;Alias=maker-Contig3008-snap-gene-0.1;Note=Protein of unknown function;Dbxref=InterPro:IPR015919,SUPERFAMILY:SSF49313;Ontology_term=GO:0005509,GO:0016020;\n" ] } ], "source": [ "%%bash\n", "# Preview the first lines of the exon and gene GFFs\n", "echo \"${exon_gff}\"\n", "head \"${exon_gff}\"\n", "echo \"\"\n", "echo \"----------------------\"\n", "echo \"\"\n", "echo \"${gene_gff}\"\n", "head \"${gene_gff}\"" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Create chromosome sizes file" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Contig0\t116746\n", "Contig1\t87411\n", "Contig10\t79314\n", "Contig100\t58076\n", "Contig1000\t25824\n", "Contig10000\t6887\n", "Contig100000\t1518\n", "Contig100001\t5004\n", "Contig100002\t1810\n", "Contig100003\t1990\n" ] } ], "source": [ "%%bash\n", "# Build the sizes file from the first two columns of the FASTA index.\n", "# Force the C locale so sort orders the names byte-wise. LC_ALL is set (rather\n", "# than LC_COLLATE) because LC_ALL overrides LC_COLLATE when both are present.\n", "export LC_ALL=C\n", "\n", "cut -f1,2 \"${fa_index}\" | sort > \"${chrome_sizes}\"\n", "\n", "head \"${chrome_sizes}\"" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Sort GFFs" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Previewing Olurida_v081-20190709.exon.sorted.gff3:\n", "\n", "##gff-version 3\n", "Contig0\tmaker\texon\t12497\t12571\t.\t+\t.\tID=OLUR_00000039-RA:exon:17;Parent=OLUR_00000039-RA;\n", "Contig0\tmaker\texon\t13175\t13233\t.\t+\t.\tID=OLUR_00000039-RA:exon:18;Parent=OLUR_00000039-RA;\n", "Contig0\tmaker\texon\t16209\t16373\t.\t+\t.\tID=OLUR_00000039-RA:exon:19;Parent=OLUR_00000039-RA;\n", "Contig0\tmaker\texon\t18857\t18959\t.\t+\t.\tID=OLUR_00000039-RA:exon:20;Parent=OLUR_00000039-RA;\n", "Contig0\tmaker\texon\t26493\t26574\t.\t+\t.\tID=OLUR_00000039-RA:exon:21;Parent=OLUR_00000039-RA;\n", "Contig0\tmaker\texon\t27806\t27862\t.\t+\t.\tID=OLUR_00000039-RA:exon:22;Parent=OLUR_00000039-RA;\n", "Contig0\tmaker\texon\t28510\t28590\t.\t+\t.\tID=OLUR_00000039-RA:exon:23;Parent=OLUR_00000039-RA;\n", "Contig0\tmaker\texon\t28735\t28775\t.\t+\t.\tID=OLUR_00000039-RA:exon:24;Parent=OLUR_00000039-RA;\n", 
"Contig0\tmaker\texon\t34306\t34366\t.\t+\t.\tID=OLUR_00000039-RA:exon:25;Parent=OLUR_00000039-RA;\n", "\n", "Confirming sort order of Olurida_v081-20190709.exon.sorted.gff3:\n", "\n", "##gff-version 3\n", "Contig0\n", "Contig1\n", "Contig100\n", "Contig1000\n", "Contig10000\n", "Contig100011\n", "Contig100018\n", "Contig100052\n", "Contig100054\n", "\n", "Previewing Olurida_v081-20190709.gene.sorted.gff3:\n", "\n", "##gff-version 3\n", "Contig0\tmaker\tgene\t12497\t93068\t.\t+\t.\tID=OLUR_00000039;Name=OLUR_00000039;Alias=maker-Contig0-snap-gene-0.8;Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);Dbxref=Coils:Coil,Gene3D:G3DSA:1.25.10.10,Gene3D:G3DSA:2.130.10.10,InterPro:IPR001680,InterPro:IPR011989,InterPro:IPR015943,InterPro:IPR016024,InterPro:IPR017986,InterPro:IPR036322,MobiDBLite:mobidb-lite,Pfam:PF00400,ProSiteProfiles:PS50082,ProSiteProfiles:PS50294,SMART:SM00320,SUPERFAMILY:SSF48371,SUPERFAMILY:SSF50978;Ontology_term=GO:0005515;\n", "Contig0\tmaker\tgene\t95722\t104318\t.\t+\t.\tID=OLUR_00000040;Name=OLUR_00000040;Alias=maker-Contig0-snap-gene-0.9;Note=Similar to SSB: Lupus La protein homolog (Bos taurus OX%3D9913);Dbxref=CDD:cd12291,Gene3D:G3DSA:1.10.10.10,Gene3D:G3DSA:3.30.70.330,InterPro:IPR000504,InterPro:IPR002344,InterPro:IPR006630,InterPro:IPR012677,InterPro:IPR014886,InterPro:IPR035979,InterPro:IPR036388,InterPro:IPR036390,MobiDBLite:mobidb-lite,PRINTS:PR00302,Pfam:PF00076,Pfam:PF05383,Pfam:PF08777,ProSiteProfiles:PS50102,ProSiteProfiles:PS50961,SMART:SM00360,SMART:SM00715,SUPERFAMILY:SSF46785,SUPERFAMILY:SSF54928;Ontology_term=GO:0003676,GO:0003723,GO:0005634,GO:0006396,GO:1990904;\n", "Contig0\tmaker\tgene\t115731\t116730\t.\t+\t.\tID=OLUR_00000041;Name=OLUR_00000041;Alias=maker-Contig0-snap-gene-0.10;Note=Protein of unknown function;Dbxref=CDD:cd00096,Gene3D:G3DSA:2.60.40.10,InterPro:IPR013783,InterPro:IPR036179,SUPERFAMILY:SSF48726;\n", 
"Contig1\tmaker\tgene\t7358\t13988\t.\t+\t.\tID=OLUR_00000198;Name=OLUR_00000198;Alias=maker-Contig1-snap-gene-0.10;Note=Similar to GPATCH3: G patch domain-containing protein 3 (Homo sapiens OX%3D9606);Dbxref=InterPro:IPR000467,Pfam:PF01585,ProSiteProfiles:PS50174,SMART:SM00443;Ontology_term=GO:0003676;\n", "Contig1\tmaker\tgene\t33348\t47451\t.\t-\t.\tID=OLUR_00000199;Name=OLUR_00000199;Alias=maker-Contig1-snap-gene-0.13;Note=Similar to CAMKK1: Calcium/calmodulin-dependent protein kinase kinase 1 (Homo sapiens OX%3D9606);Dbxref=Gene3D:G3DSA:1.10.510.10,InterPro:IPR000719,InterPro:IPR011009,MobiDBLite:mobidb-lite,Pfam:PF00069,ProSiteProfiles:PS50011,SMART:SM00220,SUPERFAMILY:SSF56112;Ontology_term=GO:0004672,GO:0005524,GO:0006468;\n", "Contig1\tmaker\tgene\t37655\t62707\t.\t+\t.\tID=OLUR_00000200;Name=OLUR_00000200;Alias=maker-Contig1-snap-gene-0.11;Note=Similar to PTH1R: Parathyroid hormone/parathyroid hormone-related peptide receptor (Didelphis virginiana OX%3D9267);Dbxref=Gene3D:G3DSA:1.20.1070.10,Gene3D:G3DSA:4.10.1240.10,InterPro:IPR000832,InterPro:IPR017981,InterPro:IPR017983,InterPro:IPR036445,MobiDBLite:mobidb-lite,PRINTS:PR00249,Pfam:PF00002,ProSitePatterns:PS00650,ProSiteProfiles:PS50261;Ontology_term=GO:0004888,GO:0004930,GO:0007166,GO:0007186,GO:0016020,GO:0016021;\n", "Contig1\tmaker\tgene\t74421\t78081\t.\t+\t.\tID=OLUR_00000201;Name=OLUR_00000201;Alias=maker-Contig1-snap-gene-0.12;Note=Protein of unknown function;Dbxref=CDD:cd00033,Gene3D:G3DSA:2.10.70.10,InterPro:IPR000436,InterPro:IPR035976,MobiDBLite:mobidb-lite,Pfam:PF00084,ProSiteProfiles:PS50923,SMART:SM00032,SUPERFAMILY:SSF57535;\n", "Contig100\tmaker\tgene\t39284\t49191\t.\t-\t.\tID=OLUR_00001175;Name=OLUR_00001175;Alias=maker-Contig100-snap-gene-0.7;Note=Protein of unknown function;Dbxref=Coils:Coil,Gene3D:G3DSA:3.40.50.10140,InterPro:IPR000157,InterPro:IPR035897,MobiDBLite:mobidb-lite,Pfam:PF13676,SUPERFAMILY:SSF52200;Ontology_term=GO:0005515,GO:0007165;\n", 
"Contig100\tmaker\tgene\t50322\t57773\t.\t-\t.\tID=OLUR_00001176;Name=OLUR_00001176;Alias=maker-Contig100-snap-gene-0.6;Note=Protein of unknown function;\n", "\n", "Confirming sort order of Olurida_v081-20190709.gene.sorted.gff3:\n", "\n", "##gff-version 3\n", "Contig0\n", "Contig1\n", "Contig100\n", "Contig1000\n", "Contig10000\n", "Contig100011\n", "Contig100018\n", "Contig100052\n", "Contig100054\n", "\n" ] } ], "source": [ "%%bash\n", "# Sort the exon and gene GFFs with bedtools, keeping the header line first.\n", "gff_array=(\"${exon_gff}\" \"${gene_gff}\")\n", "sorted_gff_array=(\"${exon_gff_sorted}\" \"${gene_gff_sorted}\")\n", "\n", "for index in \"${!gff_array[@]}\"\n", "do\n", "    # Write with '>' (truncate), not '>>': appending on a re-run leaves a second\n", "    # sorted copy in the file, which bedtools then rejects as out-of-order input.\n", "    {\n", "        # The header line is passed through untouched; only the records are sorted\n", "        awk 'NR<2 {print}' \"${gff_array[index]}\"\n", "        awk 'NR>1 {print}' \"${gff_array[index]}\" | \"${bedtools_dir}\"/bedtools sort -i -\n", "    } > \"${sorted_gff_array[index]}\"\n", "\n", "    # Check out sorted GFFs\n", "    echo \"Previewing ${sorted_gff_array[index]}:\"\n", "    echo \"\"\n", "    head \"${sorted_gff_array[index]}\"\n", "    echo \"\"\n", "    echo \"Confirming sort order of ${sorted_gff_array[index]}:\"\n", "    echo \"\"\n", "    # First few distinct values of column 1, in file order\n", "    cut -f1 \"${sorted_gff_array[index]}\" | uniq | head\n", "    echo \"\"\n", "done" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Create intergenic BED file" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Previewing Olurida_v081-20190709.intergenic.bed:\n", "\n", "Contig0\t0\t12496\n", "Contig0\t93068\t95721\n", "Contig0\t104318\t115730\n", "Contig0\t116730\t116746\n", "Contig1\t0\t7357\n", "Contig1\t13988\t33347\n", "Contig1\t62707\t74420\n", "Contig1\t78081\t87411\n", "Contig10\t0\t79314\n", "Contig100\t0\t39283\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error: Sorted input specified, but the file Olurida_v081-20190709.gene.sorted.gff3 has the following out of order record\n", "Contig0\tmaker\tgene\t12497\t93068\t.\t+\t.\tID=OLUR_00000039;Name=OLUR_00000039;Alias=maker-Contig0-snap-gene-0.8;Note=Similar to 
WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);Dbxref=Coils:Coil,Gene3D:G3DSA:1.25.10.10,Gene3D:G3DSA:2.130.10.10,InterPro:IPR001680,InterPro:IPR011989,InterPro:IPR015943,InterPro:IPR016024,InterPro:IPR017986,InterPro:IPR036322,MobiDBLite:mobidb-lite,Pfam:PF00400,ProSiteProfiles:PS50082,ProSiteProfiles:PS50294,SMART:SM00320,SUPERFAMILY:SSF48371,SUPERFAMILY:SSF50978;Ontology_term=GO:0005515;\n" ] } ], "source": [ "%%bash\n", "# Intergenic regions = everything in the sizes file that is not covered by a gene\n", "\n", "\"${bedtools_dir}\"/bedtools complement \\\n", "-i \"${gene_gff_sorted}\" \\\n", "-g \"${chrome_sizes}\" \\\n", "> \"${intergenic_bed}\"\n", "\n", "echo \"Previewing ${intergenic_bed}:\"\n", "echo \"\"\n", "head \"${intergenic_bed}\"" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Make intron BED file" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Previewing Olurida_v081-20190709.exon.sorted.comp.bed:\n", "\n", "Contig0\t0\t12496\n", "Contig0\t12571\t13174\n", "Contig0\t13233\t16208\n", "Contig0\t16373\t18856\n", "Contig0\t18959\t26492\n", "Contig0\t26574\t27805\n", "Contig0\t27862\t28509\n", "Contig0\t28590\t28734\n", "Contig0\t28775\t34305\n", "Contig0\t34366\t35487\n", "\n", "---------------------\n", "\n", "Previewing Olurida_v081-20190709.introns.bed:\n", "\n", "Contig0\t12572\t13174\n", "Contig0\t13234\t16208\n", "Contig0\t18960\t26492\n", "Contig0\t26575\t27805\n", "Contig0\t27863\t28509\n", "Contig0\t28591\t28734\n", "Contig0\t34367\t35487\n", "Contig0\t35641\t36030\n", "Contig0\t36139\t36864\n", "Contig0\t36941\t42921\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error: Sorted input specified, but the file Olurida_v081-20190709.exon.sorted.gff3 has the following out of order record\n", "Contig0\tmaker\texon\t12497\t12571\t.\t+\t.\tID=OLUR_00000039-RA:exon:17;Parent=OLUR_00000039-RA;\n" ] } ], "source": [ "%%bash\n", "# Introns = the parts of each gene that are not covered by an exon.\n", "\n", "# Step 1: everything in the sizes file that is NOT an exon\n", "\"${bedtools_dir}\"/bedtools complement \\\n", "-i \"${exon_gff_sorted}\" \\\n", "-g \"${chrome_sizes}\" \\\n", "> \"${exon_comp_bed}\"\n", "\n", "echo \"Previewing ${exon_comp_bed}:\"\n", "echo \"\"\n", "head \"${exon_comp_bed}\"\n", "echo \"\"\n", "echo \"---------------------\"\n", "\n", "# Step 2: keep the non-exon regions that fall inside genes.\n", "# The overlaps come back as GFF records (1-based start in column 4), so\n", "# subtract 1 from the start to write 0-based BED coordinates.\n", "\"${bedtools_dir}\"/bedtools intersect \\\n", "-a \"${gene_gff_sorted}\" \\\n", "-b \"${exon_comp_bed}\" \\\n", "| awk -v OFS='\\t' '{print $1,$4-1,$5}' \\\n", "> \"${intron_bed}\"\n", "\n", "echo \"\"\n", "echo \"Previewing ${intron_bed}:\"\n", "echo \"\"\n", "head \"${intron_bed}\"" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Cleanup" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 16M\n", "-rw-r--r-- 1 samb samb 1.6M Dec 17 07:44 Olurida_v081-20190709.five_prime_UTR.gff3\n", "-rw-r--r-- 1 samb samb 1.5M Dec 17 07:44 Olurida_v081-20190709.three_prime_UTR.gff3\n", "-rw-r--r-- 1 samb samb 3.7M Dec 17 07:51 Olurida_v081-20190709.intergenic.bed\n", "-rw-r--r-- 1 samb samb 6.3M Dec 17 08:09 Olurida_v081-20190709.introns.bed\n", "-rw-r--r-- 1 samb samb 2.7M Dec 17 08:15 Olurida_v081.sizes.txt\n" ] } ], "source": [ "%%bash\n", "# Remove the downloaded inputs and intermediate files, then list what remains\n", "rm \"${maker_gff}\" \\\n", "\"${exon_comp_bed}\" \\\n", "\"${exon_gff}\" \\\n", "\"${exon_gff_sorted}\" \\\n", "\"${gene_gff}\" \\\n", "\"${gene_gff_sorted}\" \\\n", "\"${fa_index}\"\n", "ls -ltrh" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.5" } }, "nbformat": 4, "nbformat_minor": 4 }