{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2949abe4-dfa6-4c4a-a395-a3a1db92b5e7",
   "metadata": {},
   "source": [
    "## Convert _P.generosa_ GFF to GTF\n",
    "\n",
    "### Notebook relies on:\n",
    "\n",
    "- [GffRead](https://github.com/gpertea/gffread)\n",
    "\n",
    "### Addresses [this GitHub Issue](https://github.com/RobertsLab/resources/issues/1411)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee0ebae6-d54c-4d18-88bd-3d3456a8b1e6",
   "metadata": {},
   "source": [
    "### List computer specs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9f60016c-d6b6-4b6d-86d3-5ee68b55464c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TODAY'S DATE:\n",
      "Tue 01 Mar 2022 10:33:08 AM PST\n",
      "------------\n",
      "\n",
      "Distributor ID:\tUbuntu\n",
      "Description:\tUbuntu 20.04.3 LTS\n",
      "Release:\t20.04\n",
      "Codename:\tfocal\n",
      "\n",
      "------------\n",
      "HOSTNAME: \n",
      "computer\n",
      "\n",
      "------------\n",
      "Computer Specs:\n",
      "\n",
      "Architecture:                    x86_64\n",
      "CPU op-mode(s):                  32-bit, 64-bit\n",
      "Byte Order:                      Little Endian\n",
      "Address sizes:                   45 bits physical, 48 bits virtual\n",
      "CPU(s):                          2\n",
      "On-line CPU(s) list:             0,1\n",
      "Thread(s) per core:              1\n",
      "Core(s) per socket:              1\n",
      "Socket(s):                       2\n",
      "NUMA node(s):                    1\n",
      "Vendor ID:                       GenuineIntel\n",
      "CPU family:                      6\n",
      "Model:                           165\n",
      "Model name:                      Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz\n",
      "Stepping:                        2\n",
      "CPU MHz:                         2400.008\n",
      "BogoMIPS:                        4800.01\n",
      "Hypervisor vendor:               VMware\n",
      "Virtualization type:             full\n",
      "L1d cache:                       64 KiB\n",
      "L1i cache:                       64 KiB\n",
      "L2 cache:                        512 KiB\n",
      "L3 cache:                        32 MiB\n",
      "NUMA node0 CPU(s):               0,1\n",
      "Vulnerability Itlb multihit:     KVM: Mitigation: VMX unsupported\n",
      "Vulnerability L1tf:              Mitigation; PTE Inversion\n",
      "Vulnerability Mds:               Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown\n",
      "Vulnerability Meltdown:          Mitigation; PTI\n",
      "Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\n",
      "Vulnerability Spectre v1:        Mitigation; usercopy/swapgs barriers and __user pointer sanitization\n",
      "Vulnerability Spectre v2:        Mitigation; Full generic retpoline, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling\n",
      "Vulnerability Srbds:             Not affected\n",
      "Vulnerability Tsx async abort:   Not affected\n",
      "Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon nopl xtopology tsc_reliable nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt xsaveopt xsavec xgetbv1 xsaves arat flush_l1d arch_capabilities\n",
      "\n",
      "------------\n",
      "\n",
      "Memory Specs\n",
      "\n",
      "              total        used        free      shared  buff/cache   available\n",
      "Mem:           54Gi       5.5Gi        35Gi       177Mi        13Gi        48Gi\n",
      "Swap:         2.0Gi          0B       2.0Gi\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No LSB modules are available.\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "echo \"TODAY'S DATE:\"\n",
    "date\n",
    "echo \"------------\"\n",
    "echo \"\"\n",
    "#Display operating system info\n",
    "lsb_release -a\n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"HOSTNAME: \"; hostname \n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"Computer Specs:\"\n",
    "echo \"\"\n",
    "lscpu\n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"\"\n",
    "echo \"Memory Specs\"\n",
    "echo \"\"\n",
    "free -mh"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19866f68-bfad-4a83-adb5-24e271e29d06",
   "metadata": {},
   "source": [
    "### Set variables\n",
    "- `%env` indicates a bash variable\n",
    "\n",
    "- without `%env` is Python variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7293bcb0-581c-4ad2-8f1e-09dd98352aaf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: data_dir=/home/sam/data/P_generosa/genomes\n",
      "env: analysis_dir=/home/sam/analyses/20220301-pgen-gff_to_gtf\n",
      "env: gff=Panopea-generosa-v1.0.a4.gff3\n",
      "env: url=https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming\n",
      "env: gtf=Panopea-generosa-v1.0.a4.gtf\n",
      "env: gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread\n"
     ]
    }
   ],
   "source": [
    "# Set directories, input/output files\n",
    "%env data_dir=/home/sam/data/P_generosa/genomes\n",
    "%env analysis_dir=/home/sam/analyses/20220301-pgen-gff_to_gtf\n",
    "analysis_dir=\"20220301-pgen-gff_to_gtf\"\n",
    "\n",
    "# Input files (from NCBI)\n",
    "%env gff=Panopea-generosa-v1.0.a4.gff3\n",
    "\n",
    "# URL to download files from NCBI\n",
    "%env url=https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming\n",
    "\n",
    "# Output file(s)\n",
    "%env gtf=Panopea-generosa-v1.0.a4.gtf\n",
    "\n",
    "\n",
    "# Set program locations\n",
    "%env gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7f204c16-2d1f-4837-93b0-1fb0e3d00d64",
   "metadata": {},
   "source": [
    "### Create analysis directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8f275e34-c56e-4754-abf7-3279667434bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "# Make analysis and data directory, if doesn't exist\n",
    "mkdir --parents \"${analysis_dir}\"\n",
    "\n",
    "mkdir --parents \"${data_dir}\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56052f6d-441a-4048-8a6f-39d58552283d",
   "metadata": {},
   "source": [
    "### Download GFF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "951fc8e9-b821-4f54-848f-f9573daadc83",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-rw-rw-r-- 1 sam sam 454M Mar 19 07:58 Panopea-generosa-v1.0.a4.gff3\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "cd \"${data_dir}\"\n",
    "\n",
    "# Download with wget.\n",
    "# Use --quiet option to prevent wget output from printing too many lines to notebook\n",
    "# Use --continue to prevent re-downloading fie if it's already been downloaded.\n",
    "# Use --no-check-certificate to avoid download error from gannet\n",
    "wget --quiet \\\n",
    "--continue \\\n",
    "--no-check-certificate \\\n",
    "${url}/${gff}\n",
    "\n",
    "ls -ltrh \"${gff}\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7eb8b7c0-5927-44c5-ba79-cbee1d5a77fb",
   "metadata": {},
   "source": [
    "### Examine GFF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "18862291-d1ec-4b62-8b22-959404538a7f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "##gff-version 3\n",
      "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n",
      "##Project Name : Pgenerosa_v074\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t2\t4719\t.\t+\t.\tID=PGEN_.00g000010;Name=PGEN_.00g000010;original_ID=21510-PGEN_.00g234140;Alias=21510-PGEN_.00g234140;original_name=21510-PGEN_.00g234140;Notes=sp|Q86IC9|CAMT1_DICDI [BLAST protein vs protein (blastp) 2.7.1],PF01596.12 [Pfam 1.6]\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tmRNA\t2\t4719\t.\t+\t.\tID=PGEN_.00g000010.m01;Name=PGEN_.00g000010.m01;Parent=PGEN_.00g000010;original_ID=21510-PGEN_.00g234140.m01;Alias=21510-PGEN_.00g234140.m01;original_name=21510-PGEN_.00g234140\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t2\t125\t.\t+\t.\tID=PGEN_.00g000010.m01.exon01;Name=PGEN_.00g000010.m01.exon01;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon1;Alias=21510-PGEN_.00g234140.m01.exon1\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t2\t125\t.\t+\t0\tID=PGEN_.00g000010.m01.CDS01;Name=PGEN_.00g000010.m01.CDS01;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t1995\t2095\t.\t+\t.\tID=PGEN_.00g000010.m01.exon02;Name=PGEN_.00g000010.m01.exon02;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon2;Alias=21510-PGEN_.00g234140.m01.exon2\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t1995\t2095\t.\t+\t1\tID=PGEN_.00g000010.m01.CDS02;Name=PGEN_.00g000010.m01.CDS02;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t3325\t3495\t.\t+\t.\tID=PGEN_.00g000010.m01.exon03;Name=PGEN_.00g000010.m01.exon03;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon3;Alias=21510-PGEN_.00g234140.m01.exon3\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t3325\t3495\t.\t+\t0\tID=PGEN_.00g000010.m01.CDS03;Name=PGEN_.00g000010.m01.CDS03;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t4651\t4719\t.\t+\t.\tID=PGEN_.00g000010.m01.exon04;Name=PGEN_.00g000010.m01.exon04;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon4;Alias=21510-PGEN_.00g234140.m01.exon4\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t4651\t4719\t.\t+\t0\tID=PGEN_.00g000010.m01.CDS04;Name=PGEN_.00g000010.m01.CDS04;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t19808\t36739\t.\t-\t.\tID=PGEN_.00g000020;Name=PGEN_.00g000020;original_ID=21510-PGEN_.00g234150;Alias=21510-PGEN_.00g234150;original_name=21510-PGEN_.00g234150;Notes=sp|P04177|TY3H_RAT [BLAST protein vs protein (blastp) 2.7.1],sp|P04177|TY3H_RAT [DIAMOND Functional 0.9.22],IPR036951 [InterProScan 5.29-68.0],PF00351.16 [Pfam 1.6]\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tmRNA\t19808\t36739\t.\t-\t.\tID=PGEN_.00g000020.m01;Name=PGEN_.00g000020.m01;Parent=PGEN_.00g000020;original_ID=21510-PGEN_.00g234150.m01;Alias=21510-PGEN_.00g234150.m01;original_name=21510-PGEN_.00g234150\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t19808\t19943\t.\t-\t2\tID=PGEN_.00g000020.m01.CDS01;Name=PGEN_.00g000020.m01.CDS01;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t19808\t19943\t.\t-\t.\tID=PGEN_.00g000020.m01.exon01;Name=PGEN_.00g000020.m01.exon01;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon10;Alias=21510-PGEN_.00g234150.m01.exon10\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t21133\t21362\t.\t-\t0\tID=PGEN_.00g000020.m01.CDS02;Name=PGEN_.00g000020.m01.CDS02;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t21133\t21362\t.\t-\t.\tID=PGEN_.00g000020.m01.exon02;Name=PGEN_.00g000020.m01.exon02;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon9;Alias=21510-PGEN_.00g234150.m01.exon9\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t22487\t22613\t.\t-\t.\tID=PGEN_.00g000020.m01.exon03;Name=PGEN_.00g000020.m01.exon03;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon8;Alias=21510-PGEN_.00g234150.m01.exon8\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "head -n 20 \"${data_dir}\"/\"${gff}\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b1598092-a1d5-4c8d-ab1c-3bd9d8fcc73a",
   "metadata": {},
   "source": [
    "### Convert GFF to GTF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9cf5e7d6-c5eb-4bc5-af4f-b2955c4de478",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "cd \"${data_dir}\"\n",
    "\n",
    "${gffread} -E \\\n",
    "${data_dir}/\"${gff}\" -T \\\n",
    "1> ${analysis_dir}/\"${gtf}\" \\\n",
    "2> ${analysis_dir}/gffread-gff_to_gtf.stderr"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e50df042-59c0-4327-b3a4-f14399eba05f",
   "metadata": {},
   "source": [
    "### Inspect GTF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "98a870cf-518f-44e5-8c7a-eb32c219ea68",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\ttranscript\t2\t4719\t.\t+\t.\ttranscript_id \"PGEN_.00g000010.m01\"; gene_id \"PGEN_.00g000010\"\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t2\t125\t.\t+\t.\ttranscript_id \"PGEN_.00g000010.m01\"; gene_id \"PGEN_.00g000010\";\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t1995\t2095\t.\t+\t.\ttranscript_id \"PGEN_.00g000010.m01\"; gene_id \"PGEN_.00g000010\";\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t3325\t3495\t.\t+\t.\ttranscript_id \"PGEN_.00g000010.m01\"; gene_id \"PGEN_.00g000010\";\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t4651\t4719\t.\t+\t.\ttranscript_id \"PGEN_.00g000010.m01\"; gene_id \"PGEN_.00g000010\";\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t2\t125\t.\t+\t0\ttranscript_id \"PGEN_.00g000010.m01\"; gene_id \"PGEN_.00g000010\";\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t1995\t2095\t.\t+\t2\ttranscript_id \"PGEN_.00g000010.m01\"; gene_id \"PGEN_.00g000010\";\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t3325\t3495\t.\t+\t0\ttranscript_id \"PGEN_.00g000010.m01\"; gene_id \"PGEN_.00g000010\";\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t4651\t4719\t.\t+\t0\ttranscript_id \"PGEN_.00g000010.m01\"; gene_id \"PGEN_.00g000010\";\n",
      "Scaffold_01\tGenSAS_5d9637f372b5d-publish\ttranscript\t19808\t36739\t.\t-\t.\ttranscript_id \"PGEN_.00g000020.m01\"; gene_id \"PGEN_.00g000020\"\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "head ${analysis_dir}/\"${gtf}\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "26445ae7-6472-4814-bde2-33c646ffcf22",
   "metadata": {},
   "source": [
    "### Generate checksum(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9a8e9dcb-eb74-4af4-8924-0553fe6f3596",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "c50d7afa83901c90e48fc53a024b51f8  gffread-gff_to_gtf.stderr\n",
      "a11a9eb85db1b4bed8df58658f6c0ecb  Panopea-generosa-v1.0.a4.gtf\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "cd \"${analysis_dir}\"\n",
    "\n",
    "for file in *\n",
    "do\n",
    "  md5sum \"${file}\" | tee --append checksums.md5\n",
    "done"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3eed5b68-470e-4b9a-af97-9ba2f27a51b5",
   "metadata": {},
   "source": [
    "### Document GffRead program options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "61be6360-3e83-4cd9-a1db-4554995b8771",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "gffread v0.12.7. Usage:\n",
      "gffread [-g <genomic_seqs_fasta> | <dir>] [-s <seq_info.fsize>] \n",
      " [-o <outfile>] [-t <trackname>] [-r [<strand>]<chr>:<start>-<end> [-R]]\n",
      " [--jmatch <chr>:<start>-<end>] [--no-pseudo] \n",
      " [-CTVNJMKQAFPGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n",
      " [-j ][--ids <IDs.lst> | --nids <IDs.lst>] [--attrs <attr-list>] [-i <maxintron>]\n",
      " [--stream] [--bed | --gtf | --tlf] [--table <attrlist>] [--sort-by <ref.lst>]\n",
      " [<input_gff>] \n",
      "\n",
      " Filter, convert or cluster GFF/GTF/BED records, extract the sequence of\n",
      " transcripts (exon or CDS) and more.\n",
      " By default (i.e. without -O) only transcripts are processed, discarding any\n",
      " other non-transcript features. Default output is a simplified GFF3 with only\n",
      " the basic attributes.\n",
      " \n",
      "Options:\n",
      " --ids discard records/transcripts if their IDs are not listed in <IDs.lst>\n",
      " --nids discard records/transcripts if their IDs are listed in <IDs.lst>\n",
      " -i   discard transcripts having an intron larger than <maxintron>\n",
      " -l   discard transcripts shorter than <minlen> bases\n",
      " -r   only show transcripts overlapping coordinate range <start>..<end>\n",
      "      (on chromosome/contig <chr>, strand <strand> if provided)\n",
      " -R   for -r option, discard all transcripts that are not fully \n",
      "      contained within the given range\n",
      " --jmatch only output transcripts matching the given junction\n",
      " -U   discard single-exon transcripts\n",
      " -C   coding only: discard mRNAs that have no CDS features\n",
      " --nc non-coding only: discard mRNAs that have CDS features\n",
      " --ignore-locus : discard locus features and attributes found in the input\n",
      " -A   use the description field from <seq_info.fsize> and add it\n",
      "      as the value for a 'descr' attribute to the GFF record\n",
      " -s   <seq_info.fsize> is a tab-delimited file providing this info\n",
      "      for each of the mapped sequences:\n",
      "      <seq-name> <seq-length> <seq-description>\n",
      "      (useful for -A option with mRNA/EST/protein mappings)\n",
      "Sorting: (by default, chromosomes are kept in the order they were found)\n",
      " --sort-alpha : chromosomes (reference sequences) are sorted alphabetically\n",
      " --sort-by : sort the reference sequences by the order in which their\n",
      "      names are given in the <refseq.lst> file\n",
      "Misc options: \n",
      " -F   keep all GFF attributes (for non-exon features)\n",
      " --keep-exon-attrs : for -F option, do not attempt to reduce redundant\n",
      "      exon/CDS attributes\n",
      " -G   do not keep exon attributes, move them to the transcript feature\n",
      "      (for GFF3 output)\n",
      " --attrs <attr-list> only output the GTF/GFF attributes listed in <attr-list>\n",
      "    which is a comma delimited list of attribute names to\n",
      " --keep-genes : in transcript-only mode (default), also preserve gene records\n",
      " --keep-comments: for GFF3 input/output, try to preserve comments\n",
      " -O   process other non-transcript GFF records (by default non-transcript\n",
      "      records are ignored)\n",
      " -V   discard any mRNAs with CDS having in-frame stop codons (requires -g)\n",
      " -H   for -V option, check and adjust the starting CDS phase\n",
      "      if the original phase leads to a translation with an \n",
      "      in-frame stop codon\n",
      " -B   for -V option, single-exon transcripts are also checked on the\n",
      "      opposite strand (requires -g)\n",
      " -P   add transcript level GFF attributes about the coding status of each\n",
      "      transcript, including partialness or in-frame stop codons (requires -g)\n",
      " --add-hasCDS : add a \"hasCDS\" attribute with value \"true\" for transcripts\n",
      "      that have CDS features\n",
      " --adj-stop stop codon adjustment: enables -P and performs automatic\n",
      "      adjustment of the CDS stop coordinate if premature or downstream\n",
      " -N   discard multi-exon mRNAs that have any intron with a non-canonical\n",
      "      splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n",
      " -J   discard any mRNAs that either lack initial START codon\n",
      "      or the terminal STOP codon, or have an in-frame stop codon\n",
      "      (i.e. only print mRNAs with a complete CDS)\n",
      " --no-pseudo: filter out records matching the 'pseudo' keyword\n",
      " --in-bed: input should be parsed as BED format (automatic if the input\n",
      "           filename ends with .bed*)\n",
      " --in-tlf: input GFF-like one-line-per-transcript format without exon/CDS\n",
      "           features (see --tlf option below); automatic if the input\n",
      "           filename ends with .tlf)\n",
      " --stream: fast processing of input GFF/BED transcripts as they are received\n",
      "           ((no sorting, exons must be grouped by transcript in the input data)\n",
      "Clustering:\n",
      " -M/--merge : cluster the input transcripts into loci, discarding\n",
      "      \"redundant\" transcripts (those with the same exact introns\n",
      "      and fully contained or equal boundaries)\n",
      " -d <dupinfo> : for -M option, write duplication info to file <dupinfo>\n",
      " --cluster-only: same as -M/--merge but without discarding any of the\n",
      "      \"duplicate\" transcripts, only create \"locus\" features\n",
      " -K   for -M option: also discard as redundant the shorter, fully contained\n",
      "       transcripts (intron chains matching a part of the container)\n",
      " -Q   for -M option, no longer require boundary containment when assessing\n",
      "      redundancy (can be combined with -K); only introns have to match for\n",
      "      multi-exon transcripts, and >=80% overlap for single-exon transcripts\n",
      " -Y   for -M option, enforce -Q but also discard overlapping single-exon \n",
      "      transcripts, even on the opposite strand (can be combined with -K)\n",
      "Output options:\n",
      " --force-exons: make sure that the lowest level GFF features are considered\n",
      "       \"exon\" features\n",
      " --gene2exon: for single-line genes not parenting any transcripts, add an\n",
      "       exon feature spanning the entire gene (treat it as a transcript)\n",
      " --t-adopt:  try to find a parent gene overlapping/containing a transcript\n",
      "       that does not have any explicit gene Parent\n",
      " -D    decode url encoded characters within attributes\n",
      " -Z    merge very close exons into a single exon (when intron size<4)\n",
      " -g   full path to a multi-fasta file with the genomic sequences\n",
      "      for all input mappings, OR a directory with single-fasta files\n",
      "      (one per genomic sequence, with file names matching sequence names)\n",
      " -j    output the junctions and the corresponding transcripts\n",
      " -w    write a fasta file with spliced exons for each transcript\n",
      " --w-add <N> for the -w option, extract additional <N> bases\n",
      "       both upstream and downstream of the transcript boundaries\n",
      " --w-nocds for -w, disable the output of CDS info in the FASTA file\n",
      " -x    write a fasta file with spliced CDS for each GFF transcript\n",
      " -y    write a protein fasta file with the translation of CDS for each record\n",
      " -W    for -w, -x and -y options, write in the FASTA defline all the exon\n",
      "       coordinates projected onto the spliced sequence;\n",
      " -S    for -y option, use '*' instead of '.' as stop codon translation\n",
      " -L    Ensembl GTF to GFF3 conversion, adds version to IDs\n",
      " -m    <chr_replace> is a name mapping table for converting reference \n",
      "       sequence names, having this 2-column format:\n",
      "       <original_ref_ID> <new_ref_ID>\n",
      " -t    use <trackname> in the 2nd column of each GFF/GTF output line\n",
      " -o    write the output records into <outfile> instead of stdout\n",
      " -T    main output will be GTF instead of GFF3\n",
      " --bed output records in BED format instead of default GFF3\n",
      " --tlf output \"transcript line format\" which is like GFF\n",
      "       but with exons and CDS related features stored as GFF \n",
      "       attributes in the transcript feature line, like this:\n",
      "         exoncount=N;exons=<exons>;CDSphase=<N>;CDS=<CDScoords> \n",
      "       <exons> is a comma-delimited list of exon_start-exon_end coordinates;\n",
      "       <CDScoords> is CDS_start:CDS_end coordinates or a list like <exons>\n",
      " --table output a simple tab delimited format instead of GFF, with columns\n",
      "       having the values of GFF attributes given in <attrlist>; special\n",
      "       pseudo-attributes (prefixed by @) are recognized:\n",
      "       @id, @geneid, @chr, @start, @end, @strand, @numexons, @exons, \n",
      "       @cds, @covlen, @cdslen\n",
      "       If any of -w/-y/-x FASTA output files are enabled, the same fields\n",
      "       (excluding @id) are appended to the definition line of corresponding\n",
      "       FASTA records\n",
      " -v,-E expose (warn about) duplicate transcript IDs and other potential\n",
      "       problems with the given GFF/GTF records\n"
     ]
    },
    {
     "ename": "CalledProcessError",
     "evalue": "Command 'b'${gffread} -h\\n'' returned non-zero exit status 1.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mCalledProcessError\u001b[0m                        Traceback (most recent call last)",
      "\u001b[0;32m/tmp/ipykernel_68870/1000630337.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_cell_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'bash'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'${gffread} -h\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mrun_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m   2417\u001b[0m             \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2418\u001b[0m                 \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mmagic_arg_s\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2419\u001b[0;31m                 \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2420\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2421\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mnamed_script_magic\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m    140\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    141\u001b[0m                 \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshebang\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    144\u001b[0m         \u001b[0;31m# write a basic docstring:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/decorator.py\u001b[0m in \u001b[0;36mfun\u001b[0;34m(*args, **kw)\u001b[0m\n\u001b[1;32m    230\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mkwsyntax\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    231\u001b[0m                 \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 232\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mcaller\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextras\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    233\u001b[0m     \u001b[0mfun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    234\u001b[0m     \u001b[0mfun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__doc__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__doc__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/magic.py\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(f, *a, **k)\u001b[0m\n\u001b[1;32m    185\u001b[0m     \u001b[0;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    186\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m         \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    189\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m    243\u001b[0m             \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    244\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_error\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m!=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mCalledProcessError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    247\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_run_script\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mto_close\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mCalledProcessError\u001b[0m: Command 'b'${gffread} -h\\n'' returned non-zero exit status 1."
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "${gffread} -h"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5740d8c5-d35d-425d-849f-7e28f02e49eb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}