{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Notebook to convert GFF to EMBL format for genome submission to ENA\n",
    "\n",
    "#### Uses [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3).\n",
    "\n",
    "#### This notebook was run in an Anaconda enironment for `EMBLmyGFF3` (needed to avoid weird dependency conflicts - likely specific to my Anaconda installation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TODAY'S DATE:\n",
      "Thu Mar 12 12:31:06 PDT 2020\n",
      "------------\n",
      "\n",
      "Distributor ID:\tUbuntu\n",
      "Description:\tUbuntu 16.04.6 LTS\n",
      "Release:\t16.04\n",
      "Codename:\txenial\n",
      "\n",
      "------------\n",
      "HOSTNAME: \n",
      "swoose\n",
      "\n",
      "------------\n",
      "Computer Specs:\n",
      "\n",
      "Architecture:          x86_64\n",
      "CPU op-mode(s):        32-bit, 64-bit\n",
      "Byte Order:            Little Endian\n",
      "CPU(s):                24\n",
      "On-line CPU(s) list:   0-23\n",
      "Thread(s) per core:    2\n",
      "Core(s) per socket:    6\n",
      "Socket(s):             2\n",
      "NUMA node(s):          1\n",
      "Vendor ID:             GenuineIntel\n",
      "CPU family:            6\n",
      "Model:                 44\n",
      "Model name:            Intel(R) Xeon(R) CPU           X5670  @ 2.93GHz\n",
      "Stepping:              2\n",
      "CPU MHz:               2925.931\n",
      "BogoMIPS:              5851.96\n",
      "Virtualization:        VT-x\n",
      "L1d cache:             32K\n",
      "L1i cache:             32K\n",
      "L2 cache:              256K\n",
      "L3 cache:              12288K\n",
      "NUMA node0 CPU(s):     0-23\n",
      "Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n",
      "\n",
      "------------\n",
      "\n",
      "Memory Specs\n",
      "\n",
      "              total        used        free      shared  buff/cache   available\n",
      "Mem:            70G        3.2G         56G        408M         10G         66G\n",
      "Swap:          4.7G        245M        4.4G\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No LSB modules are available.\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "echo \"TODAY'S DATE:\"\n",
    "date\n",
    "echo \"------------\"\n",
    "echo \"\"\n",
    "#Display operating system info\n",
    "lsb_release -a\n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"HOSTNAME: \"; hostname \n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"Computer Specs:\"\n",
    "echo \"\"\n",
    "lscpu\n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"\"\n",
    "echo \"Memory Specs\"\n",
    "echo \"\"\n",
    "free -mh"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set variables\n",
    "`%env` variables are good for passing to bash cells"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: wd=/home/sam/Downloads/20200311_swoose_pgen_genome_ena_submission\n",
      "env: embl_out=Panopea-generosa-v1.0.a4.embl\n",
      "env: combined_gff=Panopea-generosa-v1.0.a4.combined.gff\n",
      "env: fasta=Panopea-generosa-v1.0.fa\n",
      "env: pid=PRJEB36299\n",
      "env: locus_tag=PGEN\n",
      "env: species='Panopea generosa'\n"
     ]
    }
   ],
   "source": [
    "# Set workding directory\n",
    "%env wd=/home/sam/Downloads/20200311_swoose_pgen_genome_ena_submission\n",
    "wd=\"/home/sam/Downloads/20200311_swoose_pgen_genome_ena_submission\"\n",
    "\n",
    "# Input output files\n",
    "%env embl_out=Panopea-generosa-v1.0.a4.embl\n",
    "%env combined_gff=Panopea-generosa-v1.0.a4.combined.gff\n",
    "%env fasta=Panopea-generosa-v1.0.fa\n",
    "\n",
    "# ENA-specific info\n",
    "%env pid=PRJEB36299\n",
    "%env locus_tag=PGEN\n",
    "\n",
    "%env species='Panopea generosa'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/sam/Downloads/20200311_swoose_pgen_genome_ena_submission\n"
     ]
    }
   ],
   "source": [
    "cd {wd}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### All GFF files were downloaded from here: https://osf.io/yem8n/files/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 1.1G\n",
      "-rw-rw-r-- 1 sam sam 914M Mar  3 15:08 Panopea-generosa-v1.0.fa\n",
      "-rw-rw-r-- 1 sam sam  217 Mar  3 15:09 pgen_v074_ena_submission_manifest.tab\n",
      "-rw-rw-r-- 1 sam sam  658 Mar  3 21:08 Panopea-generosa-v1.0.fa.fai\n",
      "-rw-rw-r-- 1 sam sam  53M Mar  4 06:31 Panopea-generosa-v1.0.a4.CDS.gff3\n",
      "-rw-rw-r-- 1 sam sam  55M Mar  4 06:32 Panopea-generosa-v1.0.a4.exon.gff3\n",
      "-rw-rw-r-- 1 sam sam 9.1M Mar  4 06:32 Panopea-generosa-v1.0.a4.mRNA.gff3\n",
      "-rw-rw-r-- 1 sam sam 1.5K Mar  4 06:32 Panopea-generosa-v1.0.a4.rRNA.gff3\n",
      "-rw-rw-r-- 1 sam sam 2.6M Mar  4 06:32 Panopea-generosa-v1.0.a4.tRNA.gff3\n",
      "-rw-rw-r-- 1 sam sam 5.4M Mar  4 06:32 Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n",
      "-rw-rw-r-- 1 sam sam  18M Mar  4 06:33 Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n",
      "-rw-rw-r-- 1 sam sam 763K Mar  4 06:33 Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n",
      "-rw-rw-r-- 1 sam sam 143K Mar  4 06:33 Panopea-generosa-v1.0.a4.repeats.RC.gff3\n",
      "-rw-rw-r-- 1 sam sam 4.6M Mar  4 06:33 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n",
      "-rw-rw-r-- 1 sam sam  11M Mar  4 06:33 Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n",
      "-rw-rw-r-- 1 sam sam 9.6M Mar  4 14:51 Panopea-generosa-v1.0.a4.gene.gff3\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "ls -ltrh"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Concatenate GFFs to create singular GFF\n",
    "\n",
    "This step also changes the text \"notes=\" to \"note=\". This will allow that section of the GFF to be included in the output EMBL flat file. The info in this section includes information on SwissProt, InterProScan, and Pfam accession numbers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GFF line counts:\n",
      "\n",
      "Panopea-generosa-v1.0.a4.CDS.gff3\n",
      "236960\n",
      "Panopea-generosa-v1.0.a4.exon.gff3\n",
      "236960\n",
      "Panopea-generosa-v1.0.a4.gene.gff3\n",
      "34947\n",
      "Panopea-generosa-v1.0.a4.mRNA.gff3\n",
      "38326\n",
      "Panopea-generosa-v1.0.a4.repeat_region.gff3\n",
      "1676544\n",
      "Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n",
      "23195\n",
      "Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n",
      "75939\n",
      "Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n",
      "3255\n",
      "Panopea-generosa-v1.0.a4.repeats.RC.gff3\n",
      "603\n",
      "Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n",
      "19865\n",
      "Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n",
      "43129\n",
      "Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n",
      "1510558\n",
      "Panopea-generosa-v1.0.a4.rRNA.gff3\n",
      "8\n",
      "Panopea-generosa-v1.0.a4.tRNA.gff3\n",
      "16889\n",
      "\n",
      "-------------------------------\n",
      "\n",
      "Panopea-generosa-v1.0.a4.combined.gff line count:\n",
      "3917178\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Copy GFF3 header to new GFF3\n",
    "awk 'NR < 4' Panopea-generosa-v1.0.a4.CDS.gff3 > ${combined_gff}\n",
    "\n",
    "echo \"GFF line counts:\"\n",
    "echo \"\"\n",
    "\n",
    "# Concatenate all GFFs\n",
    "for gff in *.gff3\n",
    "do\n",
    "  # Print file name and line count (excluding header)\n",
    "  echo \"${gff}\"\n",
    "  awk 'NR > 3' ${gff} | wc -l\n",
    "\n",
    "  # Skip header and substitute \"notes=\" with \"note=\"\n",
    "  awk 'NR > 3' ${gff} | sed 's/notes=/note=/g' >> ${combined_gff}\n",
    "done\n",
    "\n",
    "echo \"\"\n",
    "echo \"-------------------------------\"\n",
    "echo \"\"\n",
    "\n",
    "echo \"${combined_gff} line count:\"\n",
    "awk 'NR > 3' ${combined_gff} | wc -l"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Display program options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "usage: EMBLmyGFF3 [-h] [-a] [-c CREATED]\n",
      "                  [-d {CON,PAT,EST,GSS,HTC,HTG,MGA,WGS,TSA,STS,STD}]\n",
      "                  [-g ORGANELLE] [-i LOCUS_TAG] [-k KEYWORD [KEYWORD ...]]\n",
      "                  [-l CLASSIFICATION]\n",
      "                  [-m {genomic DNA,genomic RNA,mRNA,tRNA,rRNA,other RNA,other DNA,transcribed RNA,viral cRNA,unassigned DNA,unassigned RNA}]\n",
      "                  [-o OUTPUT] [-p PROJECT_ID] [-q]\n",
      "                  [-r {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}]\n",
      "                  [-s SPECIES] [-t {linear,circular}] [-v]\n",
      "                  [-x {PHG,ENV,FUN,HUM,INV,MAM,VRT,MUS,PLN,PRO,ROD,SYN,TGN,UNC,VRL}]\n",
      "                  [-z] [--ah {One of the parameters above}] [--de DE]\n",
      "                  [--ra RA [RA ...]] [--rc RC] [--rg RG] [--rl RL] [--rt RT]\n",
      "                  [--rx RX] [--email EMAIL] [--expose_translations]\n",
      "                  [--force_unknown_features] [--force_uncomplete_features]\n",
      "                  [--interleave_genes] [--keep_duplicates]\n",
      "                  [--locus_numbering_start LOCUS_NUMBERING_START]\n",
      "                  [--no_progress] [--no_wrap_qualifier] [--shame]\n",
      "                  [--translate]\n",
      "                  [--use_attribute_value_as_locus_tag USE_ATTRIBUTE_VALUE_AS_LOCUS_TAG]\n",
      "                  [--uncompressed_log] [--version VERSION] [--strain STRAIN]\n",
      "                  [--environmental_sample]\n",
      "                  [--isolation_source ISOLATION_SOURCE] [--isolate ISOLATE]\n",
      "                  gff_file fasta\n",
      "\n",
      "EMBL writer for ENA data submission. Note that this implementation is\n",
      "basically just the documentation at\n",
      "ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt in python form - the\n",
      "implementation could be a lot more efficient! GFF convertion is based on\n",
      "specifications from https://github.com/The-Sequence-\n",
      "Ontology/Specifications/blob/master/gff3.md\n",
      "\n",
      "positional arguments:\n",
      "  gff_file              Input gff-file.\n",
      "  fasta                 Input fasta sequence.\n",
      "\n",
      "optional arguments:\n",
      "  -h, --help            show this help message and exit\n",
      "  -a, --accession       Bolean. Accession number(s) for the entry. Default\n",
      "                        value: XXX. The proper value is automatically filled\n",
      "                        up by ENA during the submission by a unique accession\n",
      "                        number they will assign. The accession number is used\n",
      "                        to set up the AC line and the first token of the ID\n",
      "                        line as well. Please visit [this\n",
      "                        page](https://www.ebi.ac.uk/ena/submit/accession-\n",
      "                        number-formats) and [this\n",
      "                        one](https://www.ebi.ac.uk/ena/submit/sequence-\n",
      "                        submission) to learn more about it. Activating the\n",
      "                        option will set the Accession number with the fasta\n",
      "                        sequence identifier.\n",
      "  -c CREATED, --created CREATED\n",
      "                        Creation time of the original entry. The default value\n",
      "                        is the date of the day.\n",
      "  -d {CON,PAT,EST,GSS,HTC,HTG,MGA,WGS,TSA,STS,STD}, --data_class {CON,PAT,EST,GSS,HTC,HTG,MGA,WGS,TSA,STS,STD}\n",
      "                        Data class of the sample. Default value 'XXX'. This\n",
      "                        option is used to set up the 5th token of the ID line.\n",
      "  -g ORGANELLE, --organelle ORGANELLE\n",
      "                        Sample organelle. No default value.\n",
      "  -i LOCUS_TAG, --locus_tag LOCUS_TAG\n",
      "                        Locus tag prefix used to set up the prefix of the\n",
      "                        locus_tag qualifier. The locus tag has to be\n",
      "                        registered at ENA prior any submission. More\n",
      "                        information [here](https://www.ebi.ac.uk/ena/submit\n",
      "                        /locus-tags).\n",
      "  -k KEYWORD [KEYWORD ...], --keyword KEYWORD [KEYWORD ...]\n",
      "                        Keywords for the entry. No default value.\n",
      "  -l CLASSIFICATION, --classification CLASSIFICATION\n",
      "                        Organism classification e.g 'Eukaryota; Opisthokonta;\n",
      "                        Metazoa'. The default value is the classification\n",
      "                        found in the NCBI taxonomy DB from the species/taxid\n",
      "                        given as --species parameter. If none is found, 'Life'\n",
      "                        will be the default value.\n",
      "  -m {genomic DNA,genomic RNA,mRNA,tRNA,rRNA,other RNA,other DNA,transcribed RNA,viral cRNA,unassigned DNA,unassigned RNA}, --molecule_type {genomic DNA,genomic RNA,mRNA,tRNA,rRNA,other RNA,other DNA,transcribed RNA,viral cRNA,unassigned DNA,unassigned RNA}\n",
      "                        Molecule type of the sample. No default value.\n",
      "  -o OUTPUT, --output OUTPUT\n",
      "                        Output filename.\n",
      "  -p PROJECT_ID, --project_id PROJECT_ID\n",
      "                        Project ID. Default is 'XXX' (This is used to set up\n",
      "                        the PR line).\n",
      "  -q, --quiet           Decrease verbosity.\n",
      "  -r {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}, --transl_table {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}\n",
      "                        Translation table. No default. (This is used to set up\n",
      "                        the translation table qualifier transl_table of the\n",
      "                        CDS features.) Please visit [NCBI genetic code](https:\n",
      "                        //www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi)\n",
      "                        for more information.\n",
      "  -s SPECIES, --species SPECIES\n",
      "                        Sample species, formatted as 'Genus species' or taxid.\n",
      "                        No default. (This is used to set up the OS line.)\n",
      "  -t {linear,circular}, --topology {linear,circular}\n",
      "                        Sequence topology. No default. (This is used to set up\n",
      "                        the Topology that is the 3rd token of the ID line.)\n",
      "  -v, --verbose         Increase verbosity.\n",
      "  -x {PHG,ENV,FUN,HUM,INV,MAM,VRT,MUS,PLN,PRO,ROD,SYN,TGN,UNC,VRL}, --taxonomy {PHG,ENV,FUN,HUM,INV,MAM,VRT,MUS,PLN,PRO,ROD,SYN,TGN,UNC,VRL}\n",
      "                        Source taxonomy. Default value 'XXX'. This option is\n",
      "                        used to set the taxonomic division within ID line (6th\n",
      "                        token).\n",
      "  -z, --gzip            Gzip output file.\n",
      "  --ah {One of the parameters above}, --advanced_help {One of the parameters above}\n",
      "                        Display advanced information of the parameter\n",
      "                        specified or of all parameters if none specified.\n",
      "  --de DE               Description. Default value 'XXX'.\n",
      "  --ra RA [RA ...], --author RA [RA ...]\n",
      "                        Author for the reference. No default value.\n",
      "  --rc RC               Reference Comment. No default value.\n",
      "  --rg RG               Reference Group, the working groups/consortia that\n",
      "                        produced the record. Default value 'XXX'.\n",
      "  --rl RL               Reference publishing location. No default value.\n",
      "  --rt RT               Reference Title. No default value.\n",
      "  --rx RX               Reference cross-reference. No default value\n",
      "  --email EMAIL         Email used to fetch information from NCBI taxonomy\n",
      "                        database. Default value 'EMBLmyGFF3@tool.org'.\n",
      "  --expose_translations\n",
      "                        Copy feature and attribute mapping files to the\n",
      "                        working directory. They will be used as mapping files\n",
      "                        instead of the default internal JSON files. You may\n",
      "                        modify them as it suits you.\n",
      "  --force_unknown_features\n",
      "                        Force to keep feature types not accepted by EMBL. /!\\\n",
      "                        Option not suitable for submission purpose.\n",
      "  --force_uncomplete_features\n",
      "                        Force to keep features whithout all the mandatory\n",
      "                        qualifiers. /!\\ Option not suitable for submission\n",
      "                        purpose.\n",
      "  --interleave_genes    Print gene features with interleaved mRNA and CDS\n",
      "                        features.\n",
      "  --keep_duplicates     Do not remove duplicate features during the process.\n",
      "                        /!\\ Option not suitable for submission purpose.\n",
      "  --locus_numbering_start LOCUS_NUMBERING_START\n",
      "                        Start locus numbering with the provided value.\n",
      "  --no_progress         Hide conversion progress counter.\n",
      "  --no_wrap_qualifier   By default there is a line wrapping at 80 characters.\n",
      "                        The cut is at the world level. Activating this option\n",
      "                        will avoid the line-wrapping for the qualifiers.\n",
      "  --shame               Suppress the shameless plug.\n",
      "  --translate           Include translation in CDS features.\n",
      "  --use_attribute_value_as_locus_tag USE_ATTRIBUTE_VALUE_AS_LOCUS_TAG\n",
      "                        Use the value of the defined attribute as locus_tag.\n",
      "  --uncompressed_log    Some logs can be compressed for better lisibility,\n",
      "                        they won't.\n",
      "  --version VERSION     Sequence version number. The default value is 1.\n",
      "  --strain STRAIN       Strain from which sequence was obtained. May be needed\n",
      "                        when organism belongs to Bacteria.\n",
      "  --environmental_sample\n",
      "                        Bolean. Identifies sequences derived by direct\n",
      "                        molecular isolation from a bulk environmental DNA\n",
      "                        sample with no reliable identification of the source\n",
      "                        organism. May be needed when organism belongs to\n",
      "                        Bacteria.\n",
      "  --isolation_source ISOLATION_SOURCE\n",
      "                        Describes the physical, environmental and/or local\n",
      "                        geographical source of the biological sample from\n",
      "                        which the sequence was derived. Mandatory when\n",
      "                        environmental_sample option used.\n",
      "  --isolate ISOLATE     Individual isolate from which the sequence was\n",
      "                        obtained. May be needed when organism belongs to\n",
      "                        Bacteria.\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "EMBLmyGFF3 -h"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Convert combined GFF to EMBL flat file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "real\t174m10.906s\n",
      "user\t159m20.296s\n",
      "sys\t14m43.564s\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "time \\\n",
    "EMBLmyGFF3 ${combined_gff} \\\n",
    "${fasta} \\\n",
    "--topology linear \\\n",
    "--molecule_type 'genomic DNA' \\\n",
    "--transl_table 1 \\\n",
    "--species 'Panopea generosa' \\\n",
    "--locus_tag ${locus_tag} \\\n",
    "--project_id ${pid} \\\n",
    "--output ${embl_out} \\\n",
    "&> emblymygff3.log"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ID   XXX; XXX; linear; genomic DNA; XXX; XXX; 89643857 BP.\n",
      "XX\n",
      "AC   XXX; \n",
      "XX\n",
      "AC * _Scaffold_01\n",
      "XX\n",
      "PR   Project:PRJEB36299;\n",
      "XX\n",
      "DT   12-Mar-2020 (Rel. 133, Created)\n",
      "XX\n",
      "DE   XXX\n",
      "XX\n",
      "KW   .\n",
      "XX\n",
      "OS   Panopea generosa\n",
      "XX\n",
      "OC   cellular organisms; Eukaryota; Opisthokonta; Metazoa; Eumetazoa; Bilateria;\n",
      "OC   Protostomia; Spiralia; Lophotrochozoa; Mollusca; Bivalvia; Heterodonta;\n",
      "OC   Euheterodonta; Myoida; Hiatelloidea; Hiatellidae; Panopea.\n",
      "XX\n",
      "RN   [1]\n",
      "RP   1-89643857\n",
      "RG   XXX\n",
      "RT   ;\n",
      "RL   Submitted (12-MAR-2020) to the INSDC.\n",
      "XX\n",
      "FH   Key             Location/Qualifiers\n",
      "FH\n",
      "FT   source          1..89643857\n",
      "FT                   /mol_type=\"genomic DNA\"\n",
      "FT                   /organism=\"Panopea generosa\"\n",
      "FT   gene            2..4719\n",
      "FT                   /locus_tag=\"PGEN_LOCUS1\"\n",
      "FT                   /note=\"source:GenSAS_5d9637f372b5d-publish\"\n",
      "FT                   /note=\"ID:PGEN_.00g000010\"\n",
      "FT                   /standard_name=\"PGEN_.00g000010\"\n",
      "FT   mRNA            join(2..125,1995..2095,3325..3495,4651..4719)\n",
      "FT                   /locus_tag=\"PGEN_LOCUS1\"\n",
      "FT                   /note=\"source:GenSAS_5d9637f372b5d-publish\"\n",
      "FT                   /note=\"ID:PGEN_.00g000010.m01\"\n",
      "FT                   /standard_name=\"PGEN_.00g000010.m01\"\n",
      "FT   CDS             join(<2..125,1995..2095,3325..3495,4651..4719)\n",
      "FT                   /standard_name=\"PGEN_.00g000010.m01.CDS01\"\n",
      "FT                   /standard_name=\"PGEN_.00g000010.m01.CDS02\"\n",
      "FT                   /standard_name=\"PGEN_.00g000010.m01.CDS03\"\n",
      "FT                   /standard_name=\"PGEN_.00g000010.m01.CDS04\"\n",
      "FT                   /locus_tag=\"PGEN_LOCUS1\"\n",
      "FT                   /codon_start=1\n",
      "FT                   /note=\"source:GenSAS_5d9637f372b5d-publish\"\n",
      "FT                   /note=\"ID:PGEN_.00g000010.m01.CDS01\"\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "head -n 50 ${embl_out}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 5.7G\n",
      "-rw-rw-r-- 1 sam sam 914M Mar  3 15:08 Panopea-generosa-v1.0.fa\n",
      "-rw-rw-r-- 1 sam sam  658 Mar  3 21:08 Panopea-generosa-v1.0.fa.fai\n",
      "-rw-rw-r-- 1 sam sam  215 Mar 12 15:59 pgen_v074_ena_submission_manifest.tab\n",
      "-rw-rw-r-- 1 sam sam 844M Mar 12 17:11 Panopea-generosa-v1.0.a4.combined.gff\n",
      "-rw-rw-r-- 1 sam sam 2.3G Mar 12 22:25 Panopea-generosa-v1.0.a4.embl\n",
      "-rw-rw-r-- 1 sam sam 805M Mar 12 22:25 emblymygff3.log\n",
      "-rw------- 1 sam sam 322M Mar 12 23:56 Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n",
      "-rw------- 1 sam sam 358M Mar 12 23:56 Panopea-generosa-v1.0.a4.repeat_region.gff3\n",
      "-rw------- 1 sam sam  53M Mar 12 23:57 Panopea-generosa-v1.0.a4.CDS.gff3\n",
      "-rw------- 1 sam sam  55M Mar 12 23:57 Panopea-generosa-v1.0.a4.exon.gff3\n",
      "-rw------- 1 sam sam 9.1M Mar 12 23:57 Panopea-generosa-v1.0.a4.mRNA.gff3\n",
      "-rw------- 1 sam sam 9.5M Mar 12 23:57 Panopea-generosa-v1.0.a4.gene.gff3\n",
      "-rw------- 1 sam sam 2.5M Mar 12 23:57 Panopea-generosa-v1.0.a4.tRNA.gff3\n",
      "-rw------- 1 sam sam  17M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n",
      "-rw------- 1 sam sam 1.4K Mar 12 23:57 Panopea-generosa-v1.0.a4.rRNA.gff3\n",
      "-rw------- 1 sam sam 9.7M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n",
      "-rw------- 1 sam sam 4.4M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n",
      "-rw------- 1 sam sam 726K Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n",
      "-rw------- 1 sam sam 5.2M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n",
      "-rw------- 1 sam sam 136K Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.RC.gff3\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "ls -ltrh"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Mainfest file required for submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "STUDY   PRJEB36299\n",
      "SAMPLE   ERS4366161\n",
      "ASSEMBLYNAME   PGEN-v1.0\n",
      "ASSEMBLY_TYPE isolate\n",
      "COVERAGE   86\n",
      "PROGRAM   Proximo\n",
      "PLATFORM   Hi-C,HiSeq4000\n",
      "MOLECULETYPE   genomic DNA\n",
      "FLATFILE   Panopea-generosa-v1.0.a4.embl.gz\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "cat pgen_v074_ena_submission_manifest.tab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "STUDY   PRJEB36299\n",
      "SAMPLE   ERS4366161\n",
      "ASSEMBLYNAME   PGEN-v1.0\n",
      "ASSEMBLY_TYPE isolate\n",
      "COVERAGE   86\n",
      "PROGRAM   Proximo\n",
      "PLATFORM   Hi-C,HiSeq4000\n",
      "MOLECULETYPE   genomic DNA\n",
      "FLATFILE   Panopea-generosa-v1.0.a4.embl.gz\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "cat pgen_v074_ena_submission_manifest.tab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### ENA requires gzipped or bzipped files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 6.1G\n",
      "-rw-rw-r-- 1 sam sam 914M Mar  3 15:08 Panopea-generosa-v1.0.fa\n",
      "-rw-rw-r-- 1 sam sam  658 Mar  3 21:08 Panopea-generosa-v1.0.fa.fai\n",
      "-rw-rw-r-- 1 sam sam  215 Mar 12 15:59 pgen_v074_ena_submission_manifest.tab\n",
      "-rw-rw-r-- 1 sam sam 844M Mar 12 17:11 Panopea-generosa-v1.0.a4.combined.gff\n",
      "-rw-rw-r-- 1 sam sam 397M Mar 12 22:25 Panopea-generosa-v1.0.a4.embl.gz\n",
      "-rw-rw-r-- 1 sam sam 2.3G Mar 12 22:25 Panopea-generosa-v1.0.a4.embl\n",
      "-rw-rw-r-- 1 sam sam 805M Mar 12 22:25 emblymygff3.log\n",
      "-rw------- 1 sam sam 322M Mar 12 23:56 Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n",
      "-rw------- 1 sam sam 358M Mar 12 23:56 Panopea-generosa-v1.0.a4.repeat_region.gff3\n",
      "-rw------- 1 sam sam  53M Mar 12 23:57 Panopea-generosa-v1.0.a4.CDS.gff3\n",
      "-rw------- 1 sam sam  55M Mar 12 23:57 Panopea-generosa-v1.0.a4.exon.gff3\n",
      "-rw------- 1 sam sam 9.1M Mar 12 23:57 Panopea-generosa-v1.0.a4.mRNA.gff3\n",
      "-rw------- 1 sam sam 9.5M Mar 12 23:57 Panopea-generosa-v1.0.a4.gene.gff3\n",
      "-rw------- 1 sam sam 2.5M Mar 12 23:57 Panopea-generosa-v1.0.a4.tRNA.gff3\n",
      "-rw------- 1 sam sam  17M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n",
      "-rw------- 1 sam sam 1.4K Mar 12 23:57 Panopea-generosa-v1.0.a4.rRNA.gff3\n",
      "-rw------- 1 sam sam 9.7M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n",
      "-rw------- 1 sam sam 4.4M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n",
      "-rw------- 1 sam sam 726K Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n",
      "-rw------- 1 sam sam 5.2M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n",
      "-rw------- 1 sam sam 136K Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.RC.gff3\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "real\t2m36.904s\n",
      "user\t2m34.020s\n",
      "sys\t0m0.756s\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "time \\\n",
    "gzip --keep Panopea-generosa-v1.0.a4.embl\n",
    "\n",
    "ls -ltrh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}