{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "fresh-throat",
   "metadata": {},
   "source": [
    "## Verify NCBI _O.lurida_ Genome Submission with GFF Annotations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "alternate-swedish",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TODAY'S DATE:\n",
      "Thu May 13 09:07:32 PDT 2021\n",
      "------------\n",
      "\n",
      "Distributor ID:\tUbuntu\n",
      "Description:\tUbuntu 20.04.2 LTS\n",
      "Release:\t20.04\n",
      "Codename:\tfocal\n",
      "\n",
      "------------\n",
      "HOSTNAME: \n",
      "computer\n",
      "\n",
      "------------\n",
      "Computer Specs:\n",
      "\n",
      "Architecture:                    x86_64\n",
      "CPU op-mode(s):                  32-bit, 64-bit\n",
      "Byte Order:                      Little Endian\n",
      "Address sizes:                   45 bits physical, 48 bits virtual\n",
      "CPU(s):                          8\n",
      "On-line CPU(s) list:             0-7\n",
      "Thread(s) per core:              1\n",
      "Core(s) per socket:              1\n",
      "Socket(s):                       8\n",
      "NUMA node(s):                    1\n",
      "Vendor ID:                       GenuineIntel\n",
      "CPU family:                      6\n",
      "Model:                           165\n",
      "Model name:                      Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz\n",
      "Stepping:                        2\n",
      "CPU MHz:                         2400.001\n",
      "BogoMIPS:                        4800.00\n",
      "Hypervisor vendor:               VMware\n",
      "Virtualization type:             full\n",
      "L1d cache:                       256 KiB\n",
      "L1i cache:                       256 KiB\n",
      "L2 cache:                        2 MiB\n",
      "L3 cache:                        128 MiB\n",
      "NUMA node0 CPU(s):               0-7\n",
      "Vulnerability Itlb multihit:     KVM: Mitigation: VMX unsupported\n",
      "Vulnerability L1tf:              Not affected\n",
      "Vulnerability Mds:               Not affected\n",
      "Vulnerability Meltdown:          Not affected\n",
      "Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\n",
      "Vulnerability Spectre v1:        Mitigation; usercopy/swapgs barriers and __user pointer sanitization\n",
      "Vulnerability Spectre v2:        Mitigation; Enhanced IBRS, IBPB conditional, RSB filling\n",
      "Vulnerability Srbds:             Not affected\n",
      "Vulnerability Tsx async abort:   Not affected\n",
      "Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon nopl xtopology tsc_reliable nonstop_tsc cpuid pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch cpuid_fault invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt xsaveopt xsavec xgetbv1 xsaves arat pku ospke md_clear flush_l1d arch_capabilities\n",
      "\n",
      "------------\n",
      "\n",
      "Memory Specs\n",
      "\n",
      "              total        used        free      shared  buff/cache   available\n",
      "Mem:           53Gi       1.4Gi        43Gi        89Mi       9.1Gi        51Gi\n",
      "Swap:         2.0Gi          0B       2.0Gi\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No LSB modules are available.\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "echo \"TODAY'S DATE:\"\n",
    "date\n",
    "echo \"------------\"\n",
    "echo \"\"\n",
    "#Display operating system info\n",
    "lsb_release -a\n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"HOSTNAME: \"; hostname \n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"Computer Specs:\"\n",
    "echo \"\"\n",
    "lscpu\n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"\"\n",
    "echo \"Memory Specs\"\n",
    "echo \"\"\n",
    "free -mh"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "narrow-twins",
   "metadata": {},
   "source": [
    "### Set variables\n",
    "\n",
    "- `%env` indicates a bash variable; without `%env` is Python variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "typical-ratio",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: data_dir=/home/samb/data/O_lurida/genomes\n",
      "env: analysis_dir=/home/samb/analyses/20210513_olur_NCBI_genome-submission-prep\n",
      "env: genome_fasta=Olurida_v081.fa\n",
      "env: orig_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n",
      "env: new_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added_no-fasta.gff\n",
      "env: sqn=20210513_Olurida-v081.sqn\n",
      "env: table2asn=/home/samb/programs/linux64.table2asn_GFF\n",
      "env: locus_tag=CGC61\n"
     ]
    }
   ],
   "source": [
    "# Set directories, input/output files\n",
    "%env data_dir=/home/samb/data/O_lurida/genomes\n",
    "%env analysis_dir=/home/samb/analyses/20210513_olur_NCBI_genome-submission-prep\n",
    "\n",
    "%env genome_fasta=Olurida_v081.fa\n",
    "%env orig_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n",
    "%env new_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added_no-fasta.gff\n",
    "\n",
    "%env sqn=20210513_Olurida-v081.sqn\n",
    "\n",
    "# NCBI verification program\n",
    "%env table2asn=/home/samb/programs/linux64.table2asn_GFF\n",
    "\n",
    "# Locus tag prefix from NCBI BioProject PRJNA393719\n",
    "%env locus_tag=CGC61"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "going-butterfly",
   "metadata": {},
   "source": [
    "### Create output directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "distant-language",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 0\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "mkdir --parents \"${analysis_dir}\"\n",
    "ls -lh \"${analysis_dir}\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "interim-pacific",
   "metadata": {},
   "source": [
    "### Download FastA and GFF\n",
    "\n",
    "If needing to use URLs:\n",
    "\n",
    "GFF: https://owl.fish.washington.edu/halfshell/genomic-databank/Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n",
    "\n",
    "- MD5: `f54512bd964f45645c34b1e8e403a2b0`\n",
    "\n",
    "FastA: http://owl.fish.washington.edu/halfshell/genomic-databank/Olurida_v081.fa\n",
    "\n",
    "- MD5: `3ac56372bd62038f264d27eef0883bd3`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "british-flush",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "receiving incremental file list\n",
      "Olurida_v081.fa\n",
      "\n",
      "sent 30 bytes  received 1,143,221,718 bytes  7,447,698.68 bytes/sec\n",
      "total size is 1,143,082,078  speedup is 1.00\n",
      "receiving incremental file list\n",
      "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n",
      "\n",
      "sent 30 bytes  received 3,105,037,889 bytes  11,478,883.25 bytes/sec\n",
      "total size is 3,104,658,743  speedup is 1.00\n",
      "total 4.0G\n",
      "-rw-rw-rw- 1 samb samb 1.1G Jun  7  2018 Olurida_v081.fa\n",
      "-rw-rw-r-- 1 samb samb 2.9G Dec 13  2019 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "rsync -avp owl:/volume1/web/halfshell/genomic-databank/Olurida_v081.fa \"${data_dir}\"\n",
    "\n",
    "rsync -avp owl:/volume1/web/halfshell/genomic-databank/Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff \"${data_dir}\"\n",
    "\n",
    "ls -lh \"${data_dir}\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "vietnamese-blake",
   "metadata": {},
   "source": [
    "### Fix GFF formatting\n",
    "\n",
    "GFF is _not_ in a standard format. When [generated by MAKER on 20190709](https://robertslab.github.io/sams-notebook/2019/07/09/Genome-Annotation-Olurida_v081-with-MAKER-and-Tissue-specific-Transcriptomes-on-Mox.html), the setting used included the genome sequences appended to the end of the GFF. These need to be removed."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "placed-huntington",
   "metadata": {},
   "source": [
    "#### Figure out beginning of FastA seqs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "growing-exemption",
   "metadata": {},
   "source": [
    "##### Identify first FastA sequence and look at a few lines before that.\n",
    "\n",
    "Uses the `-n` option to display line numbers before each result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "unauthorized-switzerland",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11574856-Contig58125\trepeatmasker\tmatch_part\t19166\t19724\t1427\t+\t.\tID=Contig58125:hsp:53138:1.3.0.0;Parent=Contig58125:hit:25067:1.3.0.0;Target=species:Helitron-N14_CGi|genus:RC%252FHelitron 474 1043 +\n",
      "11574857-Contig58125\trepeatmasker\tmatch\t19780\t19810\t232\t+\t.\tID=Contig58125:hit:25068:1.3.0.0;Name=species:rnd-4_family-195|genus:RC%2FHelitron;Target=species:rnd-4_family-195|genus:RC%2FHelitron 413 443 +\n",
      "11574858-Contig58125\trepeatmasker\tmatch_part\t19780\t19810\t232\t+\t.\tID=Contig58125:hsp:53139:1.3.0.0;Parent=Contig58125:hit:25068:1.3.0.0;Target=species:rnd-4_family-195|genus:RC%252FHelitron 413 443 +\n",
      "11574859-###\n",
      "11574860-##FASTA\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "cd ${data_dir}\n",
    "grep -n -B 5 \"^>\" ${orig_gff} | head -n 5"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "entertaining-flood",
   "metadata": {},
   "source": [
    "#### Extract GFF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "naval-embassy",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Contig58125\trepeatmasker\tmatch_part\t19780\t19810\t232\t+\t.\tID=Contig58125:hsp:53139:1.3.0.0;Parent=Contig58125:hit:25068:1.3.0.0;Target=species:rnd-4_family-195|genus:RC%252FHelitron 413 443 +\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "cd ${data_dir}\n",
    "head -n 11574858 ${orig_gff} > ${new_gff}\n",
    "\n",
    "# Check the new file\n",
    "tail -n 1 ${new_gff}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "verified-difference",
   "metadata": {},
   "source": [
    "#### Generate checksum for new GFF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fancy-judges",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "cd ${data_dir}\n",
    "md5sum ${new_gff} > checksums.md5"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "designing-overview",
   "metadata": {},
   "source": [
    "### Run NCBI verification\n",
    "\n",
    "Options were taken from here (and are the most basic options):\n",
    "\n",
    "https://www.ncbi.nlm.nih.gov/genbank/genomes_gff/\n",
    "\n",
    "See end of notebook for the full help menu with explanations of each/every possible option.\n",
    "\n",
    "Based on experience, there's a _lot_ werror output, so I've redirected stderr to `/dev/null`. Plus, all of that info is described in the resulting output files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "understanding-involvement",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 1.1G\n",
      "-rw-rw-r-- 1 samb samb  47M May 13 13:23 20210513_Olurida-v081.dr\n",
      "-rw-rw-r-- 1 samb samb 928M May 13 13:23 20210513_Olurida-v081.sqn\n",
      "-rw-rw-r-- 1 samb samb  867 May 13 13:23 20210513_Olurida-v081.stats\n",
      "-rw-rw-r-- 1 samb samb  67M May 13 13:23 20210513_Olurida-v081.val\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "real\t3m4.836s\n",
      "user\t3m4.025s\n",
      "sys\t0m0.748s\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "time \\\n",
    "${table2asn} \\\n",
    "-M n \\\n",
    "-J \\\n",
    "-c w \\\n",
    "-euk \\\n",
    "-locus-tag-prefix ${locus_tag} \\\n",
    "-gaps-min 10 \\\n",
    "-l unspecified \\\n",
    "-i ${data_dir}/${genome_fasta} \\\n",
    "-f ${data_dir}/${new_gff} \\\n",
    "-o ${analysis_dir}/${sqn} \\\n",
    "-Z \\\n",
    "2> /dev/null\n",
    "\n",
    "# List files\n",
    "ls -lh ${analysis_dir}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "monetary-decimal",
   "metadata": {},
   "source": [
    "#### Check for errors\n",
    "\n",
    "- `.stats` explanations are here: https://www.ncbi.nlm.nih.gov/genbank/genome_validation/\n",
    "\n",
    "  - Any errors need to be resolved prior to submission.\n",
    "  \n",
    "- `.dr` explanations are here: https://www.ncbi.nlm.nih.gov/genbank/asndisc/#fatal\n",
    "\n",
    "  - FATAL catagories _might_ need to be resolved prior to submission."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "hourly-nirvana",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total messages:\t\t293302\n",
      "\n",
      "=================================================================\n",
      "35926 WARNING-level messages exist\n",
      "\n",
      "SEQ_INST.TerminalGap:\t2\n",
      "SEQ_INST.LeadingX:\t1\n",
      "SEQ_FEAT.NotSpliceConsensusDonor:\t12488\n",
      "SEQ_FEAT.NotSpliceConsensusAcceptor:\t10811\n",
      "SEQ_FEAT.IntervalBeginsOrEndsInGap:\t72\n",
      "SEQ_FEAT.ShortExon:\t1182\n",
      "SEQ_FEAT.PartialProblemNotSpliceConsensus3Prime:\t6154\n",
      "SEQ_FEAT.PartialProblemNotSpliceConsensus5Prime:\t5214\n",
      "SEQ_FEAT.PartialProblem5Prime:\t2\n",
      "\n",
      "=================================================================\n",
      "257376 ERROR-level messages exist\n",
      "\n",
      "SEQ_INST.ShortSeq:\t2\n",
      "SEQ_DESCR.BioSourceMissing:\t26218\n",
      "SEQ_DESCR.NoPubFound:\t1\n",
      "SEQ_DESCR.NoSourceDescriptor:\t1\n",
      "GENERIC.MissingPubRequirement:\t1\n",
      "SEQ_FEAT.IllegalDbXref:\t230426\n",
      "SEQ_FEAT.FeatureBeginsOrEndsInGap:\t1\n",
      "SEQ_FEAT.ShortIntron:\t726\n",
      "\n",
      "=================================================================\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "cat ${analysis_dir}/20210513_Olurida-v081.stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "smart-incidence",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Discrepancy Report Results\n",
      "\n",
      "Summary\n",
      "COUNT_NUCLEOTIDES: 159429 nucleotide Bioseqs are present\n",
      "LONG_NO_ANNOTATION: 46845 bioseqs are longer than 5000nt and have no features\n",
      "NO_ANNOTATION: 133211 bioseqs have no features\n",
      "GAPS: 114914 sequences contain gaps\n",
      "LOW_QUALITY_REGION: 809 sequences contain low quality region\n",
      "FEATURE_COUNT: CDS: 32210 present\n",
      "FEATURE_COUNT: gene: 32210 present\n",
      "FEATURE_COUNT: mRNA: 32210 present\n",
      "PROTEIN_NAMES: All proteins have same name \"hypothetical protein\"\n",
      "BAD_GENE_STRAND: 3 feature locations conflict with gene location strands\n",
      "FATAL: CONTAINED_CDS: 10 coding regions are completely contained in another coding region, but on the opposite strand.\n",
      "FEATURE_LOCATION_CONFLICT: 16978 features have inconsistent gene locations.\n",
      "FATAL: BACTERIAL_JOINED_FEATURES_NO_EXCEPTION: 29207 coding regions with joined locations have no exceptions\n",
      "SHORT_INTRON: 726 introns are shorter than 10 nt\n",
      "FEATURE_LIST: Feature List\n",
      "\n",
      "Detailed Report\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "head -n 20 ${analysis_dir}/20210513_Olurida-v081.dr"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}