{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Tue Oct 29 08:51:12 PDT 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.971\n", "BogoMIPS: 5851.97\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 16G 429M 392M 53G 52G\n", "Swap: 4.7G 444K 4.7G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer 
Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Set variables\n", "`%env` variables are good for passing to bash cells" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20191029_pgen_v074.a4_repeats_counts\n", "env: rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "env: gff=Panopea-generosa-vv0.74.a4.repeat_region.gff3\n", "env: wget_gffs=--directory-prefix=$/home/sam/analyses/20191029_pgen_v074.a4_repeats_counts --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a4.repeat_region.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/\n", "env: new_gff=Panopea-generosa-vv0.74.a4.repeats\n" ] } ], "source": [
"%env wd=/home/sam/analyses/20191029_pgen_v074.a4_repeats_counts\n",
"wd=\"/home/sam/analyses/20191029_pgen_v074.a4_repeats_counts\"\n",
"gff_header = ['seqid','source','type','start','end','score','strand','phase','attributes']\n",
"%env rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/\n",
"%env gff=Panopea-generosa-vv0.74.a4.repeat_region.gff3\n",
"# In %env lines IPython expands {wd} from the Python variable above;\n",
"# a shell-style dollar-brace reference would leave a stray '$' in the stored value.\n",
"%env wget_gffs=--directory-prefix={wd} --recursive --quiet --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a4.repeat_region.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/\n",
"%env new_gff=Panopea-generosa-vv0.74.a4.repeats\n",
"\n",
"# Set genome size to 942Mbp\n",
"GENOME_SIZE = 942000000\n" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
"# Calculate percentage of genome comprised of a given feature\n",
"def ind_repeats_percent(feature_length_sum):\n",
"    \"\"\"Return feature_length_sum as a percentage of GENOME_SIZE, rounded to two decimals.\n",
"\n",
"    feature_length_sum: summed length (bp) of all features of one repeat class.\n",
"    \"\"\"\n",
"    return round(float(feature_length_sum / GENOME_SIZE * 100), 2)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Import Python modules" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import fnmatch\n", "import os\n", "import pandas" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Create necessary directories" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "mkdir --parents \"${wd}\"" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20191029_pgen_v074.a4_repeats_counts\n" ] } ], "source": [ "cd {wd}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Download _Panopea generosa_ GFFs for v074.a4.\n", "\n", "Info on GFFs is here: [https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3](https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "./\n", "Panopea-generosa-vv0.74.a4.repeat_region.gff3\n", " 390,130,212 100% 28.74MB/s 0:00:12 (xfr#1, to-chk=0/2)\n", "\n", "sent 91 bytes received 390,177,991 bytes 25,172,779.48 bytes/sec\n", "total size is 390,130,212 speedup is 1.00\n", "\n", "\n", "----------------------------------------------------------\n", "total 373M\n", "-rwx------ 1 sam users 373M Oct 14 10:13 Panopea-generosa-vv0.74.a4.repeat_region.gff3\n" ] } ], "source": [ "%%bash\n", "\n", "rsync \\\n", "--archive \\\n", "--verbose \\\n", "--progress \\\n", "--include=\"${gff}\" \\\n", "--exclude=\"*\" \\\n", "\"${rysnc_owl}\" \\\n", ".\n", "\n", "echo \"\"\n", "echo \"\"\n", "echo \"----------------------------------------------------------\"\n", "\n", "ls -lh" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### If you need to download via wget, uncomment the lines in the cell below" ] },
{ "cell_type": "code", "execution_count": 18, 
"metadata": {}, "outputs": [], "source": [ "# %%bash\n", "# time \\\n", "# wget \"${wget_gffs}\"\n", "\n", "# ls -lh ${wd}" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1\t225\t1646\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t910\t1325\t3459\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1329\t2039\t5278\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149393;Name=19535.GS22252506.PGEN_.repeat00149393;repeat_match=rnd-6_family-1529;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t3030\t3608\t3232\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149394;Name=19535.GS22252506.PGEN_.repeat00149394;repeat_match=rnd-1_family-330;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t3604\t3693\t512\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149395;Name=19535.GS22252506.PGEN_.repeat00149395;repeat_match=rnd-1_family-278;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t5038\t5265\t1614\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149396;Name=19535.GS22252506.PGEN_.repeat00149396;repeat_match=rnd-5_family-2058;repeat_class=Unknown;\n", 
"PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t5224\t5277\t240\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149397;Name=19535.GS22252506.PGEN_.repeat00149397;repeat_match=rnd-5_family-1533;repeat_class=LINE%2FL2;\n" ] } ], "source": [ "%%bash\n", "head ${gff}" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Unique repeats features in Panopea-generosa-vv0.74.a4.repeat_region.gff3:\n", "DNA\n", "LINE\n", "LTR\n", "RC\n", "Simple_repeat\n", "SINE\n", "Unknown\n", "Parsing DNA from Panopea-generosa-vv0.74.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n", "Parsing matching feature lines for DNA feature...\n", "Done with parsing DNA feature.\n", "Identified 21094 DNA features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n", "\n", "Parsing LINE from Panopea-generosa-vv0.74.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n", "Parsing matching feature lines for LINE feature...\n", "Done with parsing LINE feature.\n", "Identified 69365 LINE features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n", "\n", "Parsing LTR from Panopea-generosa-vv0.74.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n", "Parsing matching feature lines for LTR feature...\n", "Done with parsing LTR feature.\n", "Identified 2890 LTR features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n", "\n", "Parsing RC from Panopea-generosa-vv0.74.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.RC.gff3\n", "Parsing matching feature lines for RC feature...\n", "Done with parsing RC feature.\n", "Identified 546 RC features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.RC.gff3\n", "\n", "Parsing Simple_repeat from 
Panopea-generosa-vv0.74.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n", "Parsing matching feature lines for Simple_repeat feature...\n", "Done with parsing Simple_repeat feature.\n", "Identified 18121 Simple_repeat features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n", "\n", "Parsing SINE from Panopea-generosa-vv0.74.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n", "Parsing matching feature lines for SINE feature...\n", "Done with parsing SINE feature.\n", "Identified 39397 SINE features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n", "\n", "Parsing Unknown from Panopea-generosa-vv0.74.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n", "Parsing matching feature lines for Unknown feature...\n", "Done with parsing Unknown feature.\n", "Identified 1375742 Unknown features.\n", "Output file is: Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n", "\n", "----------------------------------------------\n", "\n", "total 745M\n", "-rwx------ 1 sam users 373M Oct 14 10:13 Panopea-generosa-vv0.74.a4.repeat_region.gff3\n", "-rw-rw-r-- 1 sam users 5.4M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.DNA.gff3\n", "-rw-rw-r-- 1 sam users 18M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LINE.gff3\n", "-rw-rw-r-- 1 sam users 735K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LTR.gff3\n", "-rw-rw-r-- 1 sam users 140K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.RC.gff3\n", "-rw-rw-r-- 1 sam users 4.5M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3\n", "-rw-rw-r-- 1 sam users 11M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.SINE.gff3\n", "-rw-rw-r-- 1 sam users 335M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3\n" ] } ], "source": [ "%%bash\n", "\n", "# Initialize array\n", "features_array=()\n", "\n", "# Identify 
unique features in GFF\n",
"## Store as an array\n",
"## Skip first three header lines, and then cut on two delimiters that are present\n",
"## (lines without a class= attribute are ignored; de-duplicate after trimming the subclass)\n",
"echo \"Unique repeats features in ${gff}:\"\n",
"while IFS='' read -r line\n",
"do\n",
"  features_array+=(\"$line\")\n",
"done < <(awk -F\"class=\" 'NR > 3 && NF > 1 {print $2}' \"${gff}\" \\\n",
"  | cut -d '%' -f 1 \\\n",
"  | cut -d ';' -f 1 \\\n",
"  | sort -u)\n",
"\n",
"# Check array contents\n",
"for feature in \"${features_array[@]}\"\n",
"do\n",
"  echo \"${feature}\"\n",
"done\n",
"\n",
"# Loop through array and create new GFFs from each feature\n",
"for feature in \"${features_array[@]}\"\n",
"do\n",
"  # Single source of truth for the output name, so log messages match the file written\n",
"  out_gff=\"${new_gff}.${feature}.gff3\"\n",
"  echo \"Parsing ${feature} from ${gff}...\"\n",
"  echo \"Writing GFF3 header to ${out_gff}\"\n",
"  # NOTE: the GFF has only three '##' header lines, so head -n 5 also copies the first two\n",
"  # feature lines into every output file. The feature count below and the stats cell both\n",
"  # skip five lines to compensate; change all three together if this is ever corrected.\n",
"  head -n 5 \"${gff}\" > \"${out_gff}\"\n",
"  echo \"Parsing matching feature lines for ${feature} feature...\"\n",
"  # Anchor on the repeat_class attribute so the class name cannot match elsewhere in a line\n",
"  grep \"repeat_class=${feature}\" \"${gff}\" >> \"${out_gff}\"\n",
"  echo \"Done with parsing ${feature} feature.\"\n",
"  feature_count=$(tail --lines +6 \"${out_gff}\" | wc -l)\n",
"  echo \"Identified ${feature_count} ${feature} features.\"\n",
"  echo \"Output file is: ${out_gff}\"\n",
"  echo \"\"\n",
"done\n",
"\n",
"echo \"----------------------------------------------\"\n",
"echo \"\"\n",
"ls -lh" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Check the output files" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.DNA.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", 
"PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1\t225\t1646\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t910\t1325\t3459\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t75698\t76199\t3502\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149552;Name=19535.GS22252506.PGEN_.repeat00149552;repeat_match=rnd-1_family-397;repeat_class=DNA%2FhAT-hAT5;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t189334\t189466\t440\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149753;Name=19535.GS22252506.PGEN_.repeat00149753;repeat_match=rnd-1_family-580;repeat_class=DNA%2FTcMar-Tc1;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t217948\t218189\t795\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149810;Name=19535.GS22252506.PGEN_.repeat00149810;repeat_match=rnd-6_family-5014;repeat_class=DNA%2FTcMar-Tc1;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t218368\t218586\t1139\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149812;Name=19535.GS22252506.PGEN_.repeat00149812;repeat_match=rnd-6_family-5014;repeat_class=DNA%2FTcMar-Tc1;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t261172\t261266\t561\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149912;Name=19535.GS22252506.PGEN_.repeat00149912;repeat_match=rnd-6_family-9;repeat_class=DNA%2FhAT-Tip100;\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.LINE.gff3\n", "----------------------------------------------\n", "##gff-version 
3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1\t225\t1646\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t910\t1325\t3459\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t5224\t5277\t240\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149397;Name=19535.GS22252506.PGEN_.repeat00149397;repeat_match=rnd-5_family-1533;repeat_class=LINE%2FL2;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t106383\t106576\t376\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149599;Name=19535.GS22252506.PGEN_.repeat00149599;repeat_match=rnd-5_family-1227;repeat_class=LINE%2FCR1-Zenon;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t106637\t108267\t4210\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149600;Name=19535.GS22252506.PGEN_.repeat00149600;repeat_match=rnd-1_family-321;repeat_class=LINE%2FCR1-Zenon;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t242610\t242911\t1612\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149866;Name=19535.GS22252506.PGEN_.repeat00149866;repeat_match=rnd-1_family-435;repeat_class=LINE%2FRTE-X;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t242916\t243551\t1279\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149867;Name=19535.GS22252506.PGEN_.repeat00149867;repeat_match=rnd-1_family-403;repeat_class=LINE%2FRTE-X;\n", "\n", "\n", 
"Panopea-generosa-vv0.74.a4.repeats.LTR.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1\t225\t1646\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t910\t1325\t3459\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t3821636\t3823037\t10961\t+\t.\tID=19535.GS22252506.PGEN_.repeat00155489;Name=19535.GS22252506.PGEN_.repeat00155489;repeat_match=rnd-5_family-337;repeat_class=LTR%2FGypsy;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t3823036\t3826380\t26996\t+\t.\tID=19535.GS22252506.PGEN_.repeat00155490;Name=19535.GS22252506.PGEN_.repeat00155490;repeat_match=rnd-5_family-337;repeat_class=LTR%2FGypsy;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t3831155\t3831246\t660\t+\t.\tID=19535.GS22252506.PGEN_.repeat00155515;Name=19535.GS22252506.PGEN_.repeat00155515;repeat_match=rnd-5_family-337;repeat_class=LTR%2FGypsy;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t3831536\t3832007\t3730\t+\t.\tID=19535.GS22252506.PGEN_.repeat00155517;Name=19535.GS22252506.PGEN_.repeat00155517;repeat_match=rnd-5_family-337;repeat_class=LTR%2FGypsy;\n", 
"PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t4665540\t4666330\t516\t+\t.\tID=19535.GS22252506.PGEN_.repeat00156951;Name=19535.GS22252506.PGEN_.repeat00156951;repeat_match=rnd-5_family-1106;repeat_class=LTR%2FGypsy;\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.RC.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1\t225\t1646\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t910\t1325\t3459\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t123062\t123694\t4812\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149637;Name=19535.GS22252506.PGEN_.repeat00149637;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t2285720\t2285809\t738\t+\t.\tID=19535.GS22252506.PGEN_.repeat00153160;Name=19535.GS22252506.PGEN_.repeat00153160;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t2285873\t2286021\t1221\t+\t.\tID=19535.GS22252506.PGEN_.repeat00153162;Name=19535.GS22252506.PGEN_.repeat00153162;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron;\n", 
"PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t2286296\t2286843\t4324\t+\t.\tID=19535.GS22252506.PGEN_.repeat00153165;Name=19535.GS22252506.PGEN_.repeat00153165;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t2635022\t2635479\t3530\t-\t.\tID=19535.GS22252506.PGEN_.repeat00153957;Name=19535.GS22252506.PGEN_.repeat00153957;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron;\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1\t225\t1646\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t910\t1325\t3459\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t301637\t301892\t970\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149963;Name=19535.GS22252506.PGEN_.repeat00149963;repeat_match=rnd-4_family-288;repeat_class=Simple_repeat;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t306440\t306805\t460\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149968;Name=19535.GS22252506.PGEN_.repeat00149968;repeat_match=rnd-4_family-288;repeat_class=Simple_repeat;\n", 
"PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t484555\t484765\t1048\t+\t.\tID=19535.GS22252506.PGEN_.repeat00150121;Name=19535.GS22252506.PGEN_.repeat00150121;repeat_match=rnd-6_family-10;repeat_class=Simple_repeat;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t550953\t551053\t234\t+\t.\tID=19535.GS22252506.PGEN_.repeat00150239;Name=19535.GS22252506.PGEN_.repeat00150239;repeat_match=rnd-1_family-158;repeat_class=Simple_repeat;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t551620\t551760\t386\t+\t.\tID=19535.GS22252506.PGEN_.repeat00150243;Name=19535.GS22252506.PGEN_.repeat00150243;repeat_match=rnd-1_family-158;repeat_class=Simple_repeat;\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.SINE.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1\t225\t1646\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t910\t1325\t3459\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t6637\t6844\t1072\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149404;Name=19535.GS22252506.PGEN_.repeat00149404;repeat_match=rnd-1_family-21;repeat_class=SINE%2FtRNA-Core-L2;\n", 
"PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t22157\t22354\t1063\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149446;Name=19535.GS22252506.PGEN_.repeat00149446;repeat_match=rnd-1_family-21;repeat_class=SINE%2FtRNA-Core-L2;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t39994\t40185\t1148\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149474;Name=19535.GS22252506.PGEN_.repeat00149474;repeat_match=rnd-3_family-517;repeat_class=SINE%2FtRNA-Core-L2;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t58668\t58862\t729\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149518;Name=19535.GS22252506.PGEN_.repeat00149518;repeat_match=rnd-3_family-517;repeat_class=SINE%2FtRNA-Core-L2;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t110510\t110677\t514\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149613;Name=19535.GS22252506.PGEN_.repeat00149613;repeat_match=rnd-1_family-48;repeat_class=SINE%2FtRNA-Core-L2;\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1\t225\t1646\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t910\t1325\t3459\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown;\n", 
"PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1\t225\t1646\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149391;Name=19535.GS22252506.PGEN_.repeat00149391;repeat_match=rnd-1_family-39;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t910\t1325\t3459\t+\t.\tID=19535.GS22252506.PGEN_.repeat00149392;Name=19535.GS22252506.PGEN_.repeat00149392;repeat_match=rnd-1_family-135;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1329\t2039\t5278\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149393;Name=19535.GS22252506.PGEN_.repeat00149393;repeat_match=rnd-6_family-1529;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t3030\t3608\t3232\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149394;Name=19535.GS22252506.PGEN_.repeat00149394;repeat_match=rnd-1_family-330;repeat_class=Unknown;\n", "PGA_scaffold2__36_contigs__length_69596280\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t3604\t3693\t512\t-\t.\tID=19535.GS22252506.PGEN_.repeat00149395;Name=19535.GS22252506.PGEN_.repeat00149395;repeat_match=rnd-1_family-278;repeat_class=Unknown;\n" ] } ], "source": [ "%%bash\n", "# Check the output files\n", "for file in \"${new_gff}\"*.gff3\n", "do\n", " echo \"\"\n", " echo \"\"\n", " echo \"${file}\"\n", " echo \"----------------------------------------------\"\n", " head \"${file}\"\n", "done" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get sequence length stats for repeat features" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.LINE.gff3\n", "-------------------------\n", "percent 2.91\n", "sum 27388849.00\n", "mean 394.85\n", "min 11.00\n", "median 226.00\n", "max 6604.00\n", 
"Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3\n", "-------------------------\n", "percent 0.5\n", "sum 4733271.0\n", "mean 261.2\n", "min 6.0\n", "median 125.0\n", "max 5981.0\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3\n", "-------------------------\n", "percent 29.09\n", "sum 2.740281e+08\n", "mean 1.991900e+02\n", "min 1.100000e+01\n", "median 1.440000e+02\n", "max 6.574000e+03\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.LTR.gff3\n", "-------------------------\n", "percent 0.22\n", "sum 2060084.00\n", "mean 712.83\n", "min 11.00\n", "median 316.00\n", "max 6541.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.RC.gff3\n", "-------------------------\n", "percent 0.02\n", "sum 232303.00\n", "mean 425.46\n", "min 13.00\n", "median 464.00\n", "max 674.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.SINE.gff3\n", "-------------------------\n", "percent 0.65\n", "sum 6133778.00\n", "mean 155.69\n", "min 11.00\n", "median 164.00\n", "max 934.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a4.repeats.DNA.gff3\n", "-------------------------\n", "percent 0.91\n", "sum 8602532.00\n", "mean 407.82\n", "min 11.00\n", "median 247.00\n", "max 7012.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "-------------------------\n", "Repeats composition of genome (percent): 34.3\n" ] } ], "source": [ "total_repeats_percent = 0\n", "\n", "for file in os.listdir('.'):\n", " if fnmatch.fnmatch(file, 'Panopea-generosa-vv0.74.a4.repeats*.gff3'):\n", " print('\\n' * 2)\n", " print(file)\n", " print(\"-------------------------\")\n", " # Import GFF.\n", " # Skip first five rows and file is tab-separated\n", " gff=pandas.read_csv(file, header=None, skiprows=5, 
sep=\"\\t\")\n", " # Rename columns\n", " gff.columns = gff_header\n", " # Subtract start value from end value.\n", " # Have to add 1 so that sequence length can't equal zero\n", " gff['seqlength'] = gff.apply(lambda position: position['end'] - position['start'] + 1, axis=1)\n", " gff_sum = gff['seqlength'].sum()\n", " \n", " total_repeats_percent += ind_repeats_percent(gff_sum)\n", " print (\"percent\" , ind_repeats_percent(gff_sum))\n", " \n", " # Apply functions in list to seqlength column\n", " gff_stats = gff['seqlength'].agg(['sum', 'mean', 'min', 'median', 'max'])\n", " \n", " print (gff_stats.round(2))\n", "print('\\n' * 2) \n", "print(\"-------------------------\")\n", "print (\"Repeats composition of genome (percent):\" , total_repeats_percent)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 373M\n", "-rw-rw-r-- 1 sam users 5.4M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.DNA.gff3\n", "-rw-rw-r-- 1 sam users 18M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LINE.gff3\n", "-rw-rw-r-- 1 sam users 735K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LTR.gff3\n", "-rw-rw-r-- 1 sam users 140K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.RC.gff3\n", "-rw-rw-r-- 1 sam users 4.5M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3\n", "-rw-rw-r-- 1 sam users 11M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.SINE.gff3\n", "-rw-rw-r-- 1 sam users 335M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3\n" ] } ], "source": [ "%%bash\n", "rm ${gff}\n", "ls -lh" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }