{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TODAY'S DATE:\n",
      "Thu Sep  5 14:17:02 PDT 2019\n",
      "------------\n",
      "\n",
      "Distributor ID:\tUbuntu\n",
      "Description:\tUbuntu 16.04.6 LTS\n",
      "Release:\t16.04\n",
      "Codename:\txenial\n",
      "\n",
      "------------\n",
      "HOSTNAME: \n",
      "swoose\n",
      "\n",
      "------------\n",
      "Computer Specs:\n",
      "\n",
      "Architecture:          x86_64\n",
      "CPU op-mode(s):        32-bit, 64-bit\n",
      "Byte Order:            Little Endian\n",
      "CPU(s):                24\n",
      "On-line CPU(s) list:   0-23\n",
      "Thread(s) per core:    2\n",
      "Core(s) per socket:    6\n",
      "Socket(s):             2\n",
      "NUMA node(s):          1\n",
      "Vendor ID:             GenuineIntel\n",
      "CPU family:            6\n",
      "Model:                 44\n",
      "Model name:            Intel(R) Xeon(R) CPU           X5670  @ 2.93GHz\n",
      "Stepping:              2\n",
      "CPU MHz:               2925.993\n",
      "BogoMIPS:              5851.93\n",
      "Virtualization:        VT-x\n",
      "L1d cache:             32K\n",
      "L1i cache:             32K\n",
      "L2 cache:              256K\n",
      "L3 cache:              12288K\n",
      "NUMA node0 CPU(s):     0-23\n",
      "Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp pti tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n",
      "\n",
      "------------\n",
      "\n",
      "Memory Specs\n",
      "\n",
      "              total        used        free      shared  buff/cache   available\n",
      "Mem:            70G         15G         41G        666M         13G         53G\n",
      "Swap:          4.7G        373M        4.3G\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No LSB modules are available.\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Log the date, OS release, hostname, CPU and memory of the machine running this notebook.\n",
    "rule=\"------------\"\n",
    "\n",
    "echo \"TODAY'S DATE:\"\n",
    "date\n",
    "printf '%s\\n\\n' \"${rule}\"\n",
    "\n",
    "# Operating system release info\n",
    "lsb_release -a\n",
    "printf '\\n%s\\n' \"${rule}\"\n",
    "\n",
    "echo \"HOSTNAME: \"\n",
    "hostname\n",
    "printf '\\n%s\\n' \"${rule}\"\n",
    "\n",
    "printf '%s\\n\\n' \"Computer Specs:\"\n",
    "lscpu\n",
    "printf '\\n%s\\n\\n' \"${rule}\"\n",
    "\n",
    "printf '%s\\n\\n' \"Memory Specs\"\n",
    "free -mh"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set variables\n",
    "`%env` variables are good for passing to bash cells"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: wd=/home/sam/analyses/20190905_pgen_v074.a3_repeats_counts\n",
      "env: rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/\n",
      "env: gff=Panopea-generosa-vv0.74.a3.TE.gff3\n",
      "env: wget_gffs=--directory-prefix=$/home/sam/analyses/20190905_pgen_v074.a3_repeats_counts --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a3.TE.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/\n"
     ]
    }
   ],
   "source": [
    "# Working directory, set both as an environment variable (for bash cells)\n",
    "# and as a Python variable (for IPython brace expansion further down)\n",
    "%env wd=/home/sam/analyses/20190905_pgen_v074.a3_repeats_counts\n",
    "wd=\"/home/sam/analyses/20190905_pgen_v074.a3_repeats_counts\"\n",
    "\n",
    "# Remote rsync source and the name of the GFF to fetch\n",
    "%env rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/\n",
    "%env gff=Panopea-generosa-vv0.74.a3.TE.gff3\n",
    "\n",
    "# wget arguments for the optional download cell.\n",
    "# The directory prefix uses brace expansion of the Python variable wd; a leading\n",
    "# dollar sign would be kept literally and corrupt the path.\n",
    "# The wget option for suppressing output is spelled --quiet.\n",
    "%env wget_gffs=--directory-prefix={wd} --recursive --quiet --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a3.TE.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/\n",
    "\n",
    "# Set genome size to 942Mbp\n",
    "GENOME_SIZE = 942000000\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate percentage of genome comprised of a given feature\n",
    "def ind_repeats_percent(feature_length_sum): \n",
    "    return round(float(feature_length_sum / GENOME_SIZE * 100), 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import Python modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import fnmatch\n",
    "import os\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Create necessary directories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "mkdir --parents ${wd}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/sam/analyses/20190905_pgen_v074.a3_repeats_counts\n"
     ]
    }
   ],
   "source": [
    "%cd {wd}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Download _Panopea generosa_ GFFs for v074.a3.\n",
    "\n",
    "Info on GFFs is here: [https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3](https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "receiving incremental file list\n",
      "./\n",
      "Panopea-generosa-vv0.74.a3.TE.gff3\n",
      "    160,525,914 100%  986.11kB/s    0:02:38 (xfr#1, to-chk=0/2)\n",
      "\n",
      "sent 80 bytes  received 160,545,654 bytes  564,308.38 bytes/sec\n",
      "total size is 160,525,914  speedup is 1.00\n",
      "\n",
      "\n",
      "----------------------------------------------------------\n",
      "total 154M\n",
      "-rw-rw-r-- 1 sam users 154M Sep  5 08:48 Panopea-generosa-vv0.74.a3.TE.gff3\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Copy only the file named in ${gff} from the remote directory into the current directory:\n",
    "# the include pattern admits that one name and the exclude pattern rejects everything else.\n",
    "rsync --archive --verbose --progress \\\n",
    "  --include=\"${gff}\" \\\n",
    "  --exclude=\"*\" \\\n",
    "  \"${rysnc_owl}\" .\n",
    "\n",
    "# Two blank lines and a divider, then list what was downloaded\n",
    "printf '\\n\\n%s\\n' \"----------------------------------------------------------\"\n",
    "ls -lh"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### If you need to download via wget instead, uncomment the lines in the cell below"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %%bash\n",
    "# time \\\n",
    "# wget \"${wget_gffs}\"\n",
    "\n",
    "# ls -lh ${wd}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "##gff-version 3\n",
      "##Generated using GenSAS, Tuesday 9th of July 2019 09:21:16 PM\n",
      "##Project Name : Pgenerosa_v074\n",
      "##Job Name  : RepeatMasker\n",
      "##Tool      : RepeatMasker 4.0.7\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1876\t1934\t12\t+\t.\tID=19534.GS22252505.PGEN_.repeat00000002;Name=19534.GS22252505.PGEN_.repeat00000002;repeat_match=%28ATTC%29n;repeat_class=Simple_repeat;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t2962\t3024\t249\t-\t.\tID=19534.GS22252505.PGEN_.repeat00000003;Name=19534.GS22252505.PGEN_.repeat00000003;repeat_match=HalSINE1;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8054\t8084\t28\t+\t.\tID=19534.GS22252505.PGEN_.repeat00000004;Name=19534.GS22252505.PGEN_.repeat00000004;repeat_match=%28CGT%29n;repeat_class=Simple_repeat;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8318\t8614\t1430\t+\t.\tID=19534.GS22252505.PGEN_.repeat00000005;Name=19534.GS22252505.PGEN_.repeat00000005;repeat_match=BivaMeta-SINE1_HyCu;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8572\t8621\t279\t+\t.\tID=19534.GS22252505.PGEN_.repeat00000006;Name=19534.GS22252505.PGEN_.repeat00000006;repeat_match=BivaV-SINE1_BaAz;repeat_class=Unspecified;\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Show the first ten lines of the downloaded GFF (quoted to avoid word splitting)\n",
    "head \"${gff}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Parsing DNA from Panopea-generosa-vv0.74.a3.TE.gff3...\n",
      "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n",
      "Parsing matching feature lines for DNA feature...\n",
      "Done with parsing DNA.\n",
      "Identified 38612 DNA features.\n",
      "Output file is: Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n",
      "\n",
      "Parsing LINE from Panopea-generosa-vv0.74.a3.TE.gff3...\n",
      "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n",
      "Parsing matching feature lines for LINE feature...\n",
      "Done with parsing LINE.\n",
      "Identified 73797 LINE features.\n",
      "Output file is: Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n",
      "\n",
      "Parsing LTR from Panopea-generosa-vv0.74.a3.TE.gff3...\n",
      "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n",
      "Parsing matching feature lines for LTR feature...\n",
      "Done with parsing LTR.\n",
      "Identified 11752 LTR features.\n",
      "Output file is: Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n",
      "\n",
      "Parsing SINE from Panopea-generosa-vv0.74.a3.TE.gff3...\n",
      "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n",
      "Parsing matching feature lines for SINE feature...\n",
      "Done with parsing SINE.\n",
      "Identified 146416 SINE features.\n",
      "Output file is: Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n",
      "\n",
      "Parsing Simple_repeat from Panopea-generosa-vv0.74.a3.TE.gff3...\n",
      "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n",
      "Parsing matching feature lines for Simple_repeat feature...\n",
      "Done with parsing Simple_repeat.\n",
      "Identified 299997 Simple_repeat features.\n",
      "Output file is: Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n",
      "\n",
      "Parsing Unknown from Panopea-generosa-vv0.74.a3.TE.gff3...\n",
      "Writing GFF3 header to Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n",
      "Parsing matching feature lines for Unknown feature...\n",
      "Done with parsing Unknown.\n",
      "Identified 1465471 Unknown features.\n",
      "Output file is: Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n",
      "\n",
      "----------------------------------------------\n",
      "\n",
      "total 1.1G\n",
      "-rw-rw-r-- 1 sam users 9.6M Sep  5 15:27 Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n",
      "-rw-rw-r-- 1 sam users  19M Sep  5 15:27 Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n",
      "-rw-rw-r-- 1 sam users 2.9M Sep  5 15:27 Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n",
      "-rw-rw-r-- 1 sam users  73M Sep  5 15:27 Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n",
      "-rw-rw-r-- 1 sam users  37M Sep  5 15:27 Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n",
      "-rw-rw-r-- 1 sam users 356M Sep  5 15:27 Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n",
      "-rw-rw-r-- 1 sam users 550M Sep  5 15:13 Panopea-generosa-vv0.74.a3.TE.gff3\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "# Set array of GFF features to parse out\n",
    "gff_features=(DNA LINE LTR SINE Simple_repeat Unknown)\n",
    "\n",
    "# Number of header lines copied from the top of the input GFF into each output file\n",
    "header_lines=5\n",
    "\n",
    "# Loop through array and create new GFFs from each feature\n",
    "for feature in \"${gff_features[@]}\"\n",
    "do\n",
    "    # Build the output name once so every message and redirect agrees\n",
    "    out_gff=\"Panopea-generosa-vv0.74.a3.repeats.${feature}.gff3\"\n",
    "\n",
    "    echo \"Parsing ${feature} from ${gff}...\"\n",
    "    echo \"Writing GFF3 header to ${out_gff}\"\n",
    "    head -n \"${header_lines}\" \"${gff}\" > \"${out_gff}\"\n",
    "\n",
    "    echo \"Parsing matching feature lines for ${feature} feature...\"\n",
    "    # Appends every input line containing the feature string anywhere in the line\n",
    "    grep -- \"${feature}\" \"${gff}\" >> \"${out_gff}\"\n",
    "    echo \"Done with parsing ${feature}.\"\n",
    "\n",
    "    # Count the lines that follow the copied header\n",
    "    feature_count=$(tail --lines \"+$((header_lines + 1))\" \"${out_gff}\" | wc -l)\n",
    "    echo \"Identified ${feature_count} ${feature} features.\"\n",
    "    echo \"Output file is: ${out_gff}\"\n",
    "    echo \"\"\n",
    "done\n",
    "\n",
    "echo \"----------------------------------------------\"\n",
    "echo \"\"\n",
    "ls -lh"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check the output files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n",
      "----------------------------------------------\n",
      "##gff-version 3\n",
      "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n",
      "##Project Name : Pgenerosa_v074\n",
      "##Job Name  : Masked Repeat Consensus\n",
      "##Tool      : Mask Sequence Consensus\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t60243\t60322\t325\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000029;Name=19534.GS22252505.PGEN_.repeat00000030;repeat_match=DNA2-25_CGi;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t99944\t100057\t231\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000070;Name=19534.GS22252505.PGEN_.repeat00000071;repeat_match=DNA-8-3_HM;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t99996\t100093\t243\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000072;Name=19534.GS22252505.PGEN_.repeat00000073;repeat_match=DNA-8-3_HM;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t175568\t175784\t354\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000125;Name=19534.GS22252505.PGEN_.repeat00000126;repeat_match=DNA6-7_CGi;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t176798\t177015\t413\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000126;Name=19534.GS22252505.PGEN_.repeat00000127;repeat_match=DNA6-7_CGi;repeat_class=Unspecified;\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n",
      "----------------------------------------------\n",
      "##gff-version 3\n",
      "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n",
      "##Project Name : Pgenerosa_v074\n",
      "##Job Name  : Masked Repeat Consensus\n",
      "##Tool      : Mask Sequence Consensus\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t9841029\t9841128\t231\t-\t.\tID=19647.GS22252505.PGEN_.repeat00006523;Name=19534.GS22252505.PGEN_.repeat00006524;repeat_match=LINE-1_AA;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t29525247\t29525307\t228\t-\t.\tID=19647.GS22252505.PGEN_.repeat00018917;Name=19534.GS22252505.PGEN_.repeat00018918;repeat_match=LINE-1_AA;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t53452706\t53452792\t230\t-\t.\tID=19647.GS22252505.PGEN_.repeat00033564;Name=19534.GS22252505.PGEN_.repeat00033565;repeat_match=LINE-1_AA;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t62496090\t62496144\t227\t-\t.\tID=19647.GS22252505.PGEN_.repeat00039216;Name=19534.GS22252505.PGEN_.repeat00039217;repeat_match=LINER1;repeat_class=Unspecified;\n",
      "PGA_scaffold5__109_contigs__length_67248332\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t9822634\t9822692\t241\t+\t.\tID=19647.GS22252509.PGEN_.repeat00190759;Name=19534.GS22252509.PGEN_.repeat00190760;repeat_match=LINE-1_AA;repeat_class=Unspecified;\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n",
      "----------------------------------------------\n",
      "##gff-version 3\n",
      "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n",
      "##Project Name : Pgenerosa_v074\n",
      "##Job Name  : Masked Repeat Consensus\n",
      "##Tool      : Mask Sequence Consensus\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1038677\t1038733\t240\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000714;Name=19534.GS22252505.PGEN_.repeat00000715;repeat_match=BURRO3_LTR;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1060569\t1060608\t255\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000735;Name=19534.GS22252505.PGEN_.repeat00000736;repeat_match=Gypsy-21_PBa-LTR;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1265345\t1265753\t259\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000872;Name=19534.GS22252505.PGEN_.repeat00000873;repeat_match=Gypsy-31_SM-LTR;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1555555\t1555840\t277\t-\t.\tID=19647.GS22252505.PGEN_.repeat00001054;Name=19534.GS22252505.PGEN_.repeat00001055;repeat_match=Gypsy-7B_LVa-LTR;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1655290\t1655412\t242\t-\t.\tID=19647.GS22252505.PGEN_.repeat00001099;Name=19534.GS22252505.PGEN_.repeat00001100;repeat_match=Gypsy-30_SM-LTR;repeat_class=Unspecified;\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n",
      "----------------------------------------------\n",
      "##gff-version 3\n",
      "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n",
      "##Project Name : Pgenerosa_v074\n",
      "##Job Name  : Masked Repeat Consensus\n",
      "##Tool      : Mask Sequence Consensus\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t1876\t1934\t12\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000001;Name=19534.GS22252505.PGEN_.repeat00000002;repeat_match=%28ATTC%29n;repeat_class=Simple_repeat;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8054\t8084\t28\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000003;Name=19534.GS22252505.PGEN_.repeat00000004;repeat_match=%28CGT%29n;repeat_class=Simple_repeat;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t9671\t9700\t13\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000006;Name=19534.GS22252505.PGEN_.repeat00000007;repeat_match=%28ACGG%29n;repeat_class=Simple_repeat;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t14388\t14424\t21\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000008;Name=19534.GS22252505.PGEN_.repeat00000009;repeat_match=%28ACAGACG%29n;repeat_class=Simple_repeat;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t18639\t18670\t12\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000009;Name=19534.GS22252505.PGEN_.repeat00000010;repeat_match=%28AGGGGG%29n;repeat_class=Simple_repeat;\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n",
      "----------------------------------------------\n",
      "##gff-version 3\n",
      "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n",
      "##Project Name : Pgenerosa_v074\n",
      "##Job Name  : Masked Repeat Consensus\n",
      "##Tool      : Mask Sequence Consensus\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t2962\t3024\t249\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000002;Name=19534.GS22252505.PGEN_.repeat00000003;repeat_match=HalSINE1;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8318\t8614\t1430\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000004;Name=19534.GS22252505.PGEN_.repeat00000005;repeat_match=BivaMeta-SINE1_HyCu;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t8572\t8621\t279\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000005;Name=19534.GS22252505.PGEN_.repeat00000006;repeat_match=BivaV-SINE1_BaAz;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t20530\t20623\t308\t-\t.\tID=19647.GS22252505.PGEN_.repeat00000010;Name=19534.GS22252505.PGEN_.repeat00000011;repeat_match=BivaV-SINE1_BaAz;repeat_class=Unspecified;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d250896def4c-repeatmasker\trepeat_region\t20897\t21090\t519\t+\t.\tID=19647.GS22252505.PGEN_.repeat00000011;Name=19534.GS22252505.PGEN_.repeat00000012;repeat_match=BivaMeta-SINE1_HyCu;repeat_class=Unspecified;\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n",
      "----------------------------------------------\n",
      "##gff-version 3\n",
      "##Generated using GenSAS, Monday 15th of July 2019 06:30:16 AM\n",
      "##Project Name : Pgenerosa_v074\n",
      "##Job Name  : Masked Repeat Consensus\n",
      "##Tool      : Mask Sequence Consensus\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t265\t338\t334\t+\t.\tID=19647.GS22252505.PGEN_.repeat00632680;Name=19535.GS22252505.PGEN_.repeat00000002;repeat_match=rnd-5_family-367;repeat_class=Unknown;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t855\t1050\t451\t+\t.\tID=19647.GS22252505.PGEN_.repeat00632681;Name=19535.GS22252505.PGEN_.repeat00000003;repeat_match=rnd-5_family-1818;repeat_class=Unknown;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1040\t1183\t651\t-\t.\tID=19647.GS22252505.PGEN_.repeat00632682;Name=19535.GS22252505.PGEN_.repeat00000004;repeat_match=rnd-3_family-335;repeat_class=Unknown;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1618\t1752\t282\t-\t.\tID=19647.GS22252505.PGEN_.repeat00632684;Name=19535.GS22252505.PGEN_.repeat00000006;repeat_match=rnd-6_family-1300;repeat_class=Unknown;\n",
      "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1764\t1846\t508\t+\t.\tID=19647.GS22252505.PGEN_.repeat00632685;Name=19535.GS22252505.PGEN_.repeat00000007;repeat_match=rnd-1_family-872;repeat_class=Unknown;\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Print the first ten lines of each per-feature GFF under a filename banner\n",
    "for file in Panopea-generosa-vv0.74.a3.repeats*.gff3\n",
    "do\n",
    "    echo \"\"\n",
    "    echo \"\"\n",
    "    echo \"${file}\"\n",
    "    echo \"----------------------------------------------\"\n",
    "    # Quoted so a filename containing spaces is passed as one argument\n",
    "    head \"${file}\"\n",
    "done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set list of column header names\n",
    "gff_header = ['seqid','source','type','start','end','score','strand','phase','attributes']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get sequence length stats for repeat features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.DNA.gff3\n",
      "-------------------------\n",
      "percent 1.2\n",
      "sum       11316890.00\n",
      "mean           293.09\n",
      "min              1.00\n",
      "median         154.00\n",
      "max           7012.00\n",
      "Name: seqlength, dtype: float64\n",
      "\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.Simple_repeat.gff3\n",
      "-------------------------\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sam/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "percent 2.03\n",
      "sum       19132744.00\n",
      "mean            63.78\n",
      "min              1.00\n",
      "median          41.00\n",
      "max          12422.00\n",
      "Name: seqlength, dtype: float64\n",
      "\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.LINE.gff3\n",
      "-------------------------\n",
      "percent 3.11\n",
      "sum       29258694.00\n",
      "mean           396.48\n",
      "min             11.00\n",
      "median         227.00\n",
      "max           6604.00\n",
      "Name: seqlength, dtype: float64\n",
      "\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.LTR.gff3\n",
      "-------------------------\n",
      "percent 0.46\n",
      "sum       4355629.00\n",
      "mean          370.63\n",
      "min             1.00\n",
      "median        276.00\n",
      "max          6541.00\n",
      "Name: seqlength, dtype: float64\n",
      "\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.SINE.gff3\n",
      "-------------------------\n",
      "percent 2.3\n",
      "sum       21645991.00\n",
      "mean           147.84\n",
      "min              1.00\n",
      "median         142.00\n",
      "max            934.00\n",
      "Name: seqlength, dtype: float64\n",
      "\n",
      "\n",
      "\n",
      "Panopea-generosa-vv0.74.a3.repeats.Unknown.gff3\n",
      "-------------------------\n",
      "percent 31.14\n",
      "sum       2.933161e+08\n",
      "mean      2.001500e+02\n",
      "min       1.100000e+01\n",
      "median    1.450000e+02\n",
      "max       1.098100e+04\n",
      "Name: seqlength, dtype: float64\n",
      "\n",
      "\n",
      "\n",
      "-------------------------\n",
      "Repeats composition of genome (percent): 40.24\n"
     ]
    }
   ],
   "source": [
    "# Running total of feature lengths across all per-feature GFFs.\n",
    "# NOTE: lengths are summed as-is, so overlapping features (within or between\n",
    "# files) are counted more than once.\n",
    "total_repeats_length = 0\n",
    "\n",
    "# os.listdir() returns names in arbitrary order; sort so the report order is reproducible\n",
    "repeat_gffs = sorted(\n",
    "    name for name in os.listdir('.')\n",
    "    if fnmatch.fnmatch(name, 'Panopea-generosa-vv0.74.a3.repeats*.gff3')\n",
    ")\n",
    "\n",
    "for file in repeat_gffs:\n",
    "    print('\\n' * 2)\n",
    "    print(file)\n",
    "    print(\"-------------------------\")\n",
    "    # Import GFF: skip the 5 header lines; file is tab-separated.\n",
    "    # Only start/end are needed, so the other columns are not parsed. That also\n",
    "    # avoids the mixed-type DtypeWarning raised for the score column.\n",
    "    gff = pandas.read_csv(\n",
    "        file,\n",
    "        header=None,\n",
    "        names=gff_header,\n",
    "        usecols=['start', 'end'],\n",
    "        skiprows=5,\n",
    "        sep=\"\\t\",\n",
    "    )\n",
    "    # GFF coordinates are 1-based and inclusive, so length = end - start + 1\n",
    "    # (vectorized column arithmetic instead of a row-by-row apply)\n",
    "    gff['seqlength'] = gff['end'] - gff['start'] + 1\n",
    "    gff_sum = gff['seqlength'].sum()\n",
    "    total_repeats_length += gff_sum\n",
    "\n",
    "    print(\"percent\", ind_repeats_percent(gff_sum))\n",
    "\n",
    "    # Apply functions in list to seqlength column\n",
    "    gff_stats = gff['seqlength'].agg(['sum', 'mean', 'min', 'median', 'max'])\n",
    "    print(gff_stats.round(2))\n",
    "\n",
    "# Compute the overall percentage from the raw total and round once, rather than\n",
    "# adding up percentages that were each already rounded to two decimals\n",
    "total_repeats_percent = ind_repeats_percent(total_repeats_length)\n",
    "\n",
    "print('\\n' * 2)\n",
    "print(\"-------------------------\")\n",
    "print(\"Repeats composition of genome (percent):\", total_repeats_percent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}