{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Thu Sep 5 14:17:02 PDT 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.993\n", "BogoMIPS: 5851.93\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp pti tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 15G 41G 666M 13G 53G\n", "Swap: 4.7G 373M 4.3G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer 
Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set variables\n", "`%env` variables are good for passing to bash cells" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20190905_pgen_v074.a3_repeats_counts\n", "env: rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "env: gff=Panopea-generosa-vv0.74.a3.TE.gff3\n", "env: wget_gffs=--directory-prefix=$/home/sam/analyses/20190905_pgen_v074.a3_repeats_counts --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a3.TE.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/\n" ] } ], "source": [ "%env wd=/home/sam/analyses/20190905_pgen_v074.a3_repeats_counts\n", "wd=\"/home/sam/analyses/20190905_pgen_v074.a3_repeats_counts\"\n", "\n", "%env rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "%env gff=Panopea-generosa-vv0.74.a3.TE.gff3\n", "# The Python variable wd is interpolated with {wd}; writing ${wd} here leaves a stray '$' in front of the path\n", "%env wget_gffs=--directory-prefix={wd} --recursive --quiet --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a3.TE.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/\n", "\n", "# Set genome size to 942Mbp\n", "GENOME_SIZE = 942000000\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Calculate percentage of genome comprised of a given feature\n", "def ind_repeats_percent(feature_length_sum):\n", "    \"\"\"Return feature_length_sum as a percentage of GENOME_SIZE, rounded to 2 decimal places.\"\"\"\n", "    return round(float(feature_length_sum / GENOME_SIZE * 100), 2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import Python modules" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import fnmatch\n", "import os\n", "import pandas" ] }, { "cell_type": "markdown", "metadata":
{}, "source": [ "#### Create necessary directories" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "mkdir --parents ${wd}" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20190905_pgen_v074.a3_repeats_counts\n" ] } ], "source": [ "cd {wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Download _Panopea generosa_ GFFs for v074.a3.\n", "\n", "Info on GFFs is here: [Roberts Lab wiki: Genomic Resources, genome feature tracks](https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "./\n", "Panopea-generosa-vv0.74.a3.TE.gff3\n", " 160,525,914 100% 986.11kB/s 0:02:38 (xfr#1, to-chk=0/2)\n", "\n", "sent 80 bytes received 160,545,654 bytes 564,308.38 bytes/sec\n", "total size is 160,525,914 speedup is 1.00\n", "\n", "\n", "----------------------------------------------------------\n", "total 154M\n", "-rw-rw-r-- 1 sam users 154M Sep 5 08:48 Panopea-generosa-vv0.74.a3.TE.gff3\n" ] } ], "source": [ "%%bash\n", "\n", "rsync \\\n", "--archive \\\n", "--verbose \\\n", "--progress \\\n", "--include=\"${gff}\" \\\n", "--exclude=\"*\" \\\n", "\"${rysnc_owl}\" \\\n", ".\n", "\n", "echo \"\"\n", "echo \"\"\n", "echo \"----------------------------------------------------------\"\n", "\n", "ls -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### If you need to download via wget instead, uncomment the lines in the cell below" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# %%bash\n", "# time \\\n", "# wget \"${wget_gffs}\"\n", "\n", "# ls -lh ${wd}" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"##gff-version 3\n", "##Generated using GenSAS, Tuesday 9th of July 2019 09:21:16 PM\n", "##Project Name : Pgenerosa_v074\n", "##Job Name : RepeatMasker\n", "##Tool : RepeatMasker 4.0.7\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1876\t1934\t12\t+\t.\tID=19534.GS22252505.PGEN_.repeat00000002;Name=19534.GS22252505.PGEN_.repeat00000002;repeat_match=%28ATTC%29n;repeat_class=Simple_repeat;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t2962\t3024\t249\t-\t.\tID=19534.GS22252505.PGEN_.repeat00000003;Name=19534.GS22252505.PGEN_.repeat00000003;repeat_match=HalSINE1;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8054\t8084\t28\t+\t.\tID=19534.GS22252505.PGEN_.repeat00000004;Name=19534.GS22252505.PGEN_.repeat00000004;repeat_match=%28CGT%29n;repeat_class=Simple_repeat;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8318\t8614\t1430\t+\t.\tID=19534.GS22252505.PGEN_.repeat00000005;Name=19534.GS22252505.PGEN_.repeat00000005;repeat_match=BivaMeta-SINE1_HyCu;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8572\t8621\t279\t+\t.\tID=19534.GS22252505.PGEN_.repeat00000006;Name=19534.GS22252505.PGEN_.repeat00000006;repeat_match=BivaV-SINE1_BaAz;repeat_class=Unspecified;\n" ] } ], "source": [ "%%bash\n", "head ${gff}" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parsing DNA from Panopea-generosa-vv0.74.a3.TE.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n", "Parsing matching feature lines for DNA feature...\n", "Done with parsing DNA.\n", "Identified 38612 DNA features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n", "\n", "Parsing 
LINE from Panopea-generosa-vv0.74.a3.TE.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n", "Parsing matching feature lines for LINE feature...\n", "Done with parsing LINE.\n", "Identified 73797 LINE features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n", "\n", "Parsing LTR from Panopea-generosa-vv0.74.a3.TE.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n", "Parsing matching feature lines for LTR feature...\n", "Done with parsing LTR.\n", "Identified 11752 LTR features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n", "\n", "Parsing SINE from Panopea-generosa-vv0.74.a3.TE.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n", "Parsing matching feature lines for SINE feature...\n", "Done with parsing SINE.\n", "Identified 146416 SINE features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n", "\n", "Parsing Simple_repeat from Panopea-generosa-vv0.74.a3.TE.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n", "Parsing matching feature lines for Simple_repeat feature...\n", "Done with parsing Simple_repeat.\n", "Identified 299997 Simple_repeat features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n", "\n", "Parsing Unknown from Panopea-generosa-vv0.74.a3.TE.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n", "Parsing matching feature lines for Unknown feature...\n", "Done with parsing Unknown.\n", "Identified 1465471 Unknown features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n", "\n", "----------------------------------------------\n", "\n", "total 1.1G\n", "-rw-rw-r-- 1 sam users 9.6M Sep 5 15:27 Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n", "-rw-rw-r-- 1 sam users 19M Sep 5 15:27 Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n", "-rw-rw-r-- 1 sam users 2.9M Sep 5 15:27 
Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n", "-rw-rw-r-- 1 sam users 73M Sep 5 15:27 Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n", "-rw-rw-r-- 1 sam users 37M Sep 5 15:27 Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n", "-rw-rw-r-- 1 sam users 356M Sep 5 15:27 Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n", "-rw-rw-r-- 1 sam users 550M Sep 5 15:13 Panopea-generosa-vv0.74.a3.TE.gff3\n" ] } ], "source": [ "%%bash\n", "\n", "# Set array of GFF features to parse out\n", "gff_features=(DNA LINE LTR SINE Simple_repeat Unknown)\n", "\n", "# Loop through array and create a new GFF for each feature.\n", "# All expansions are quoted so names with spaces or glob characters stay intact.\n", "for feature in \"${gff_features[@]}\"\n", "do\n", "  # Build the per-feature output filename once and reuse it below\n", "  out_gff=\"Panopea-generosa-vv0.74.a3.repeats.${feature}.gff3\"\n", "  echo \"Parsing ${feature} from ${gff}...\"\n", "  echo \"Writing GFF3 header to ${out_gff}\"\n", "  # The first 5 lines of the source GFF are its '##' header lines\n", "  head -n 5 \"${gff}\" > \"${out_gff}\"\n", "  echo \"Parsing matching feature lines for ${feature} feature...\"\n", "  # Match the feature name as a literal string anywhere on the line;\n", "  # '--' stops option parsing in case a feature name begins with '-'\n", "  grep --fixed-strings -- \"${feature}\" \"${gff}\" >> \"${out_gff}\"\n", "  echo \"Done with parsing ${feature}.\"\n", "  # Count everything after the 5 copied header lines\n", "  feature_count=$(tail --lines +6 \"${out_gff}\" | wc -l)\n", "  echo \"Identified ${feature_count} ${feature} features.\"\n", "  echo \"Output file is: ${out_gff}\"\n", "  echo \"\"\n", "done\n", "\n", "echo \"----------------------------------------------\"\n", "echo \"\"\n", "ls -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check the output files" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n", "##Project Name : Pgenerosa_v074\n", "##Job Name : Masked Repeat Consensus\n", "##Tool : Mask Sequence Consensus\n",
"PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t60243\t60322\t325\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000029;Name=19534.GS22252505.PGEN_.repeat00000030;repeat_match=DNA2-25_CGi;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t99944\t100057\t231\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000070;Name=19534.GS22252505.PGEN_.repeat00000071;repeat_match=DNA-8-3_HM;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t99996\t100093\t243\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000072;Name=19534.GS22252505.PGEN_.repeat00000073;repeat_match=DNA-8-3_HM;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t175568\t175784\t354\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000125;Name=19534.GS22252505.PGEN_.repeat00000126;repeat_match=DNA6-7_CGi;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t176798\t177015\t413\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000126;Name=19534.GS22252505.PGEN_.repeat00000127;repeat_match=DNA6-7_CGi;repeat_class=Unspecified;\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n", "##Project Name : Pgenerosa_v074\n", "##Job Name : Masked Repeat Consensus\n", "##Tool : Mask Sequence Consensus\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t9841029\t9841128\t231\t-\t.\tID=19647.GS22252505.PGEN_.repeat00006523;Name=19534.GS22252505.PGEN_.repeat00006524;repeat_match=LINE-1_AA;repeat_class=Unspecified;\n", 
"PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t29525247\t29525307\t228\t-\t.\tID=19647.GS22252505.PGEN_.repeat00018917;Name=19534.GS22252505.PGEN_.repeat00018918;repeat_match=LINE-1_AA;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t53452706\t53452792\t230\t-\t.\tID=19647.GS22252505.PGEN_.repeat00033564;Name=19534.GS22252505.PGEN_.repeat00033565;repeat_match=LINE-1_AA;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t62496090\t62496144\t227\t-\t.\tID=19647.GS22252505.PGEN_.repeat00039216;Name=19534.GS22252505.PGEN_.repeat00039217;repeat_match=LINER1;repeat_class=Unspecified;\n", "PGA_scaffold5__109_contigs__length_67248332\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t9822634\t9822692\t241\t+\t.\tID=19647.GS22252509.PGEN_.repeat00190759;Name=19534.GS22252509.PGEN_.repeat00190760;repeat_match=LINE-1_AA;repeat_class=Unspecified;\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n", "##Project Name : Pgenerosa_v074\n", "##Job Name : Masked Repeat Consensus\n", "##Tool : Mask Sequence Consensus\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1038677\t1038733\t240\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000714;Name=19534.GS22252505.PGEN_.repeat00000715;repeat_match=BURRO3_LTR;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1060569\t1060608\t255\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000735;Name=19534.GS22252505.PGEN_.repeat00000736;repeat_match=Gypsy-21_PBa-LTR;repeat_class=Unspecified;\n", 
"PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1265345\t1265753\t259\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000872;Name=19534.GS22252505.PGEN_.repeat00000873;repeat_match=Gypsy-31_SM-LTR;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1555555\t1555840\t277\t-\t.\tID=19647.GS22252505.PGEN_.repeat00001054;Name=19534.GS22252505.PGEN_.repeat00001055;repeat_match=Gypsy-7B_LVa-LTR;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1655290\t1655412\t242\t-\t.\tID=19647.GS22252505.PGEN_.repeat00001099;Name=19534.GS22252505.PGEN_.repeat00001100;repeat_match=Gypsy-30_SM-LTR;repeat_class=Unspecified;\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n", "##Project Name : Pgenerosa_v074\n", "##Job Name : Masked Repeat Consensus\n", "##Tool : Mask Sequence Consensus\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1876\t1934\t12\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000001;Name=19534.GS22252505.PGEN_.repeat00000002;repeat_match=%28ATTC%29n;repeat_class=Simple_repeat;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8054\t8084\t28\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000003;Name=19534.GS22252505.PGEN_.repeat00000004;repeat_match=%28CGT%29n;repeat_class=Simple_repeat;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t9671\t9700\t13\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000006;Name=19534.GS22252505.PGEN_.repeat00000007;repeat_match=%28ACGG%29n;repeat_class=Simple_repeat;\n", 
"PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t14388\t14424\t21\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000008;Name=19534.GS22252505.PGEN_.repeat00000009;repeat_match=%28ACAGACG%29n;repeat_class=Simple_repeat;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t18639\t18670\t12\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000009;Name=19534.GS22252505.PGEN_.repeat00000010;repeat_match=%28AGGGGG%29n;repeat_class=Simple_repeat;\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n", "##Project Name : Pgenerosa_v074\n", "##Job Name : Masked Repeat Consensus\n", "##Tool : Mask Sequence Consensus\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t2962\t3024\t249\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000002;Name=19534.GS22252505.PGEN_.repeat00000003;repeat_match=HalSINE1;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8318\t8614\t1430\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000004;Name=19534.GS22252505.PGEN_.repeat00000005;repeat_match=BivaMeta-SINE1_HyCu;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8572\t8621\t279\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000005;Name=19534.GS22252505.PGEN_.repeat00000006;repeat_match=BivaV-SINE1_BaAz;repeat_class=Unspecified;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t20530\t20623\t308\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000010;Name=19534.GS22252505.PGEN_.repeat00000011;repeat_match=BivaV-SINE1_BaAz;repeat_class=Unspecified;\n", 
"PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t20897\t21090\t519\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000011;Name=19534.GS22252505.PGEN_.repeat00000012;repeat_match=BivaMeta-SINE1_HyCu;repeat_class=Unspecified;\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n", "##Project Name : Pgenerosa_v074\n", "##Job Name : Masked Repeat Consensus\n", "##Tool : Mask Sequence Consensus\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t265\t338\t334\t+\t.\tID=19647.GS22252505.PGEN_.repeat00632680;Name=19535.GS22252505.PGEN_.repeat00000002;repeat_match=rnd-5_family-367;repeat_class=Unknown;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t855\t1050\t451\t+\t.\tID=19647.GS22252505.PGEN_.repeat00632681;Name=19535.GS22252505.PGEN_.repeat00000003;repeat_match=rnd-5_family-1818;repeat_class=Unknown;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1040\t1183\t651\t-\t.\tID=19647.GS22252505.PGEN_.repeat00632682;Name=19535.GS22252505.PGEN_.repeat00000004;repeat_match=rnd-3_family-335;repeat_class=Unknown;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1618\t1752\t282\t-\t.\tID=19647.GS22252505.PGEN_.repeat00632684;Name=19535.GS22252505.PGEN_.repeat00000006;repeat_match=rnd-6_family-1300;repeat_class=Unknown;\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1764\t1846\t508\t+\t.\tID=19647.GS22252505.PGEN_.repeat00632685;Name=19535.GS22252505.PGEN_.repeat00000007;repeat_match=rnd-1_family-872;repeat_class=Unknown;\n" ] } ], "source": [ "%%bash\n", "# Check the output files\n", "for file in 
Panopea-generosa-vv0.74.a3.repeats*.gff3\n", "do\n", " echo \"\"\n", " echo \"\"\n", " echo \"${file}\"\n", " echo \"----------------------------------------------\"\n", " head ${file}\n", "done" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Set list of column header names\n", "gff_header = ['seqid','source','type','start','end','score','strand','phase','attributes']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get sequence length stats for repeat features" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n", "-------------------------\n", "percent 1.2\n", "sum 11316890.00\n", "mean 293.09\n", "min 1.00\n", "median 154.00\n", "max 7012.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n", "-------------------------\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/sam/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (5) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "percent 2.03\n", "sum 19132744.00\n", "mean 63.78\n", "min 1.00\n", "median 41.00\n", "max 12422.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n", "-------------------------\n", "percent 3.11\n", "sum 29258694.00\n", "mean 396.48\n", "min 11.00\n", "median 227.00\n", "max 6604.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n", "-------------------------\n", "percent 0.46\n", "sum 4355629.00\n", "mean 370.63\n", "min 1.00\n", "median 276.00\n", "max 6541.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n", "-------------------------\n", "percent 2.3\n", "sum 21645991.00\n", "mean 147.84\n", "min 1.00\n", "median 142.00\n", "max 934.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n", "-------------------------\n", "percent 31.14\n", "sum 2.933161e+08\n", "mean 2.001500e+02\n", "min 1.100000e+01\n", "median 1.450000e+02\n", "max 1.098100e+04\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "-------------------------\n", "Repeats composition of genome (percent): 40.24\n" ] } ], "source": [
"# Running total of repeat bases across all feature files.\n",
"# Raw lengths are accumulated and the percentage is rounded once at the end,\n",
"# so per-feature rounding error does not compound in the total.\n",
"total_repeats_length = 0\n",
"\n",
"# os.listdir() returns names in arbitrary order; sort for a reproducible report\n",
"for gff_file in sorted(os.listdir('.')):\n",
"    if not fnmatch.fnmatch(gff_file, 'Panopea-generosa-vv0.74.a3.repeats*.gff3'):\n",
"        continue\n",
"    print('\\n' * 2)\n",
"    print(gff_file)\n",
"    print(\"-------------------------\")\n",
"    # Import GFF.\n",
"    # Skip the five header lines; file is tab-separated and has no column header row.\n",
"    # low_memory=False parses each column in a single pass, which avoids the\n",
"    # mixed-type DtypeWarning otherwise raised for column 5 (score).\n",
"    repeats = pandas.read_csv(gff_file, header=None, names=gff_header, skiprows=5, sep=\"\\t\", low_memory=False)\n",
"    # Feature length = end - start + 1: GFF3 start and end are both inclusive,\n",
"    # so a one-base feature has start == end.\n",
"    # Column arithmetic is vectorized; no row-by-row apply() needed.\n",
"    repeats['seqlength'] = repeats['end'] - repeats['start'] + 1\n",
"    feature_length_sum = repeats['seqlength'].sum()\n",
"\n",
"    total_repeats_length += feature_length_sum\n",
"    print (\"percent\" , ind_repeats_percent(feature_length_sum))\n",
"\n",
"    # Apply functions in list to seqlength column\n",
"    seqlength_stats = repeats['seqlength'].agg(['sum', 'mean', 'min', 'median', 'max'])\n",
"\n",
"    print (seqlength_stats.round(2))\n",
"\n",
"# Percent of the genome covered by all parsed repeat features combined\n",
"total_repeats_percent = ind_repeats_percent(total_repeats_length)\n",
"print('\\n' * 2)\n",
"print(\"-------------------------\")\n",
"print (\"Repeats composition of genome (percent):\" , total_repeats_percent)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }