{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Notebook for getting count matrix for 4 libraries to be used in `DESeq2` from crab transcriptome v 3.1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Based on this notebook from Steven Roberts:  https://github.com/sr320/nb-2020/blob/master/C_bairdi/20-kallisto.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "kallisto 0.46.1\r\n",
      "\r\n",
      "Usage: kallisto <CMD> [arguments] ..\r\n",
      "\r\n",
      "Where <CMD> can be one of:\r\n",
      "\r\n",
      "    index         Builds a kallisto index \r\n",
      "    quant         Runs the quantification algorithm \r\n",
      "    bus           Generate BUS files for single-cell data \r\n",
      "    pseudo        Runs the pseudoalignment step \r\n",
      "    merge         Merges several batch runs \r\n",
      "    h5dump        Converts HDF5-formatted results to plaintext\r\n",
      "    inspect       Inspects and gives information about an index\r\n",
      "    version       Prints version information\r\n",
      "    cite          Prints citation information\r\n",
      "\r\n",
      "Running kallisto <CMD> without arguments prints usage information for <CMD>\r\n",
      "\r\n"
     ]
    }
   ],
   "source": [
    "!/Applications/bioinfo/kallisto/kallisto"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks\r\n"
     ]
    }
   ],
   "source": [
    "!pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cbai_transcriptome_31          kallisto-4libraries.ipynb\r\n",
      "cbai_transcriptome_v3.1.fasta  readme.md\r\n",
      "\u001b[34mkallisto\u001b[m\u001b[m/\r\n"
     ]
    }
   ],
   "source": [
    "ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2020-08-12 11:58:06--  https://owl.fish.washington.edu/halfshell/genomic-databank/cbai_transcriptome_v3.1.fasta\n",
      "Resolving owl.fish.washington.edu (owl.fish.washington.edu)... 128.95.149.83\n",
      "Connecting to owl.fish.washington.edu (owl.fish.washington.edu)|128.95.149.83|:443... connected.\n",
      "WARNING: cannot verify owl.fish.washington.edu's certificate, issued by ‘CN=InCommon RSA Server CA,OU=InCommon,O=Internet2,L=Ann Arbor,ST=MI,C=US’:\n",
      "  Unable to locally verify the issuer's authority.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 145648199 (139M)\n",
      "Saving to: ‘cbai_transcriptome_v3.1.fasta’\n",
      "\n",
      "cbai_transcriptome_ 100%[===================>] 138.90M  10.4MB/s    in 13s     \n",
      "\n",
      "2020-08-12 11:58:21 (10.5 MB/s) - ‘cbai_transcriptome_v3.1.fasta’ saved [145648199/145648199]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/cbai_transcriptome_v3.1.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "[build] loading fasta file cbai_transcriptome_v3.1.fasta\n",
      "[build] k-mer length: 31\n",
      "[build] warning: clipped off poly-A tail (longer than 10)\n",
      "        from 12560 target sequences\n",
      "[build] counting k-mers ... done.\n",
      "[build] building target de Bruijn graph ...  done \n",
      "[build] creating equivalence classes ...  done\n",
      "[build] target de Bruijn graph has 303833 contigs and contains 43881871 k-mers \n",
      "\n"
     ]
    }
   ],
   "source": [
    "!/Applications/bioinfo/kallisto/kallisto \\\n",
    "index -i cbai_transcriptome_31 \\\n",
    "cbai_transcriptome_v3.1.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "[quant] fragment length distribution will be estimated from the data\n",
      "[index] k-mer length: 31\n",
      "[index] number of targets: 78,649\n",
      "[index] number of k-mers: 43,881,871\n",
      "[index] number of equivalence classes: 213,585\n",
      "[quant] running in paired-end mode\n",
      "[quant] will process pair 1: /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380822_S3_L001_R1_001.fastp-trim.202004144409.fq.gz\n",
      "                             /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380822_S3_L001_R2_001.fastp-trim.202004144409.fq.gz\n",
      "[quant] finding pseudoalignments for the reads ... done\n",
      "[quant] processed 9,369,940 reads, 6,141,882 reads pseudoaligned\n",
      "[quant] estimated average fragment length: 160.365\n",
      "[   em] quantifying the abundances ... done\n",
      "[   em] the Expectation-Maximization algorithm ran for 1,119 rounds\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!/Applications/bioinfo/kallisto/kallisto quant \\\n",
    "-i /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/cbai_transcriptome_31 \\\n",
    "-t 4 \\\n",
    "-o kallisto/380822_cold_uninfected/ \\\n",
    "/Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380822_S3_L001_R1_001.fastp-trim.202004144409.fq.gz /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380822_S3_L001_R2_001.fastp-trim.202004144409.fq.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "[quant] fragment length distribution will be estimated from the data\n",
      "[index] k-mer length: 31\n",
      "[index] number of targets: 78,649\n",
      "[index] number of k-mers: 43,881,871\n",
      "[index] number of equivalence classes: 213,585\n",
      "[quant] running in paired-end mode\n",
      "[quant] will process pair 1: /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380823_S4_L001_R1_001.fastp-trim.202004144852.fq.gz\n",
      "                             /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380823_S4_L001_R2_001.fastp-trim.202004144852.fq.gz\n",
      "[quant] finding pseudoalignments for the reads ... done\n",
      "[quant] processed 8,760,917 reads, 5,426,016 reads pseudoaligned\n",
      "[quant] estimated average fragment length: 164.564\n",
      "[   em] quantifying the abundances ... done\n",
      "[   em] the Expectation-Maximization algorithm ran for 1,249 rounds\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!/Applications/bioinfo/kallisto/kallisto quant \\\n",
    "-i /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/cbai_transcriptome_31 \\\n",
    "-t 4 \\\n",
    "-o kallisto/380823_cold_infected/ \\\n",
    "/Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380823_S4_L001_R1_001.fastp-trim.202004144852.fq.gz /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380823_S4_L001_R2_001.fastp-trim.202004144852.fq.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "[quant] fragment length distribution will be estimated from the data\n",
      "[index] k-mer length: 31\n",
      "[index] number of targets: 78,649\n",
      "[index] number of k-mers: 43,881,871\n",
      "[index] number of equivalence classes: 213,585\n",
      "[quant] running in paired-end mode\n",
      "[quant] will process pair 1: /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380824_S5_L001_R2_001.fastp-trim.202004145320.fq.gz\n",
      "                             /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380824_S5_L001_R1_001.fastp-trim.202004145320.fq.gz\n",
      "[quant] finding pseudoalignments for the reads ... done\n",
      "[quant] processed 9,861,054 reads, 6,460,364 reads pseudoaligned\n",
      "[quant] estimated average fragment length: 173.524\n",
      "[   em] quantifying the abundances ... done\n",
      "[   em] the Expectation-Maximization algorithm ran for 1,206 rounds\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!/Applications/bioinfo/kallisto/kallisto quant \\\n",
    "-i /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/cbai_transcriptome_31 \\\n",
    "-t 4 \\\n",
    "-o kallisto/380824_warm_uninfected/ \\\n",
    "/Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380824_S5_L001_R2_001.fastp-trim.202004145320.fq.gz /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380824_S5_L001_R1_001.fastp-trim.202004145320.fq.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "[quant] fragment length distribution will be estimated from the data\n",
      "[index] k-mer length: 31\n",
      "[index] number of targets: 78,649\n",
      "[index] number of k-mers: 43,881,871\n",
      "[index] number of equivalence classes: 213,585\n",
      "[quant] running in paired-end mode\n",
      "[quant] will process pair 1: /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380825_S6_L001_R1_001.fastp-trim.202004145835.fq.gz\n",
      "                             /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380825_S6_L001_R2_001.fastp-trim.202004145835.fq.gz\n",
      "[quant] finding pseudoalignments for the reads ... done\n",
      "[quant] processed 10,184,344 reads, 6,441,322 reads pseudoaligned\n",
      "[quant] estimated average fragment length: 160.804\n",
      "[   em] quantifying the abundances ... done\n",
      "[   em] the Expectation-Maximization algorithm ran for 1,259 rounds\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!/Applications/bioinfo/kallisto/kallisto quant \\\n",
    "-i /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/cbai_transcriptome_31 \\\n",
    "-t 4 \\\n",
    "-o kallisto/380825_warm_infected/ \\\n",
    "/Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380825_S6_L001_R1_001.fastp-trim.202004145835.fq.gz /Volumes/web/Atumefaciens/20200414_cbai_RNAseq_fastp_trimming/380825_S6_L001_R2_001.fastp-trim.202004145835.fq.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "####################################################################################\n",
      "#\n",
      "# Usage:  /Applications/bioinfo/trinityrnaseq-v2.8.6/util/abundance_estimates_to_matrix.pl --est_method <method>  sample1.results sample2.results ...\n",
      "#\n",
      "#      or  /Applications/bioinfo/trinityrnaseq-v2.8.6/util/abundance_estimates_to_matrix.pl --est_method <method> --quant_files file.listing_target_files.txt\n",
      "#\n",
      "#      Note, if only a single input file is given, it's expected to contain the paths to all the target abundance estimation files.\n",
      "#\n",
      "# Required:\n",
      "#            \n",
      "#  --est_method <string>           RSEM|eXpress|kallisto|salmon  (needs to know what format to expect)\n",
      "#\n",
      "#  --gene_trans_map <string>           the gene-to-transcript mapping file. (if you don't want gene estimates, indicate 'none'.\n",
      "#\n",
      "#\n",
      "# Options:\n",
      "#\n",
      "#  --cross_sample_norm <string>         TMM|UpperQuartile|none   (default: TMM)\n",
      "#\n",
      "#  --name_sample_by_basedir             name sample column by dirname instead of filename\n",
      "#      --basedir_index <int>            default(-2)\n",
      "#\n",
      "#  --out_prefix <string>                default: value for --est_method\n",
      "#\n",
      "#  --quant_files <string>              file containing a list of all the target files.\n",
      "#\n",
      "######################################################################################\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!perl /Applications/bioinfo/trinityrnaseq-v2.8.6/util/abundance_estimates_to_matrix.pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-reading file: /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/kallisto/380822_cold_uninfected/abundance.tsv\n",
      "-reading file: /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/kallisto/380823_cold_infected/abundance.tsv\n",
      "-reading file: /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/kallisto/380824_warm_uninfected/abundance.tsv\n",
      "-reading file: /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/kallisto/380825_warm_infected/abundance.tsv\n",
      "\n",
      "\n",
      "* Outputting combined matrix.\n",
      "\n",
      "/Applications/bioinfo/trinityrnaseq-v2.8.6/util/support_scripts/run_TMM_scale_matrix.pl --matrix /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/kallisto-0812/kallisto-0812.isoform.TPM.not_cross_norm > /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/kallisto-0812/kallisto-0812.isoform.TMM.EXPR.matrixCMD: R --no-save --no-restore --no-site-file --no-init-file -q < /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/kallisto-0812/kallisto-0812.isoform.TPM.not_cross_norm.runTMM.R 1>&2 \n",
      "dyld: Library not loaded: @rpath/libreadline.6.2.dylib\n",
      "  Referenced from: /Users/graciecrandall/anaconda/lib/R/lib/libR.dylib\n",
      "  Reason: image not found\n",
      "sh: line 1:  5170 Abort trap: 6           R --no-save --no-restore --no-site-file --no-init-file -q < /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/kallisto-0812/kallisto-0812.isoform.TPM.not_cross_norm.runTMM.R 1>&2\n",
      "Error, cmd: R --no-save --no-restore --no-site-file --no-init-file -q < /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/kallisto-0812/kallisto-0812.isoform.TPM.not_cross_norm.runTMM.R 1>&2  died with ret (34304)  at /Applications/bioinfo/trinityrnaseq-v2.8.6/util/support_scripts/run_TMM_scale_matrix.pl line 105.\n",
      "Error, CMD: /Applications/bioinfo/trinityrnaseq-v2.8.6/util/support_scripts/run_TMM_scale_matrix.pl --matrix /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/kallisto-0812/kallisto-0812.isoform.TPM.not_cross_norm > /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/kallisto-0812/kallisto-0812.isoform.TMM.EXPR.matrix died with ret 6400 at /Applications/bioinfo/trinityrnaseq-v2.8.6/util/abundance_estimates_to_matrix.pl line 385.\n"
     ]
    }
   ],
   "source": [
    "!perl /Applications/bioinfo/trinityrnaseq-v2.8.6/util/abundance_estimates_to_matrix.pl \\\n",
    "--est_method kallisto \\\n",
    "    --gene_trans_map none \\\n",
    "    --out_prefix /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/kallisto-0812/kallisto-0812 \\\n",
    "    --name_sample_by_basedir \\\n",
    "     /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/kallisto/380822_cold_uninfected/abundance.tsv \\\n",
    "     /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/kallisto/380823_cold_infected/abundance.tsv \\\n",
    "     /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/kallisto/380824_warm_uninfected/abundance.tsv \\\n",
    "     /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks/kallisto/380825_warm_infected/abundance.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\t380822_cold_uninfected\t380823_cold_infected\t380824_warm_uninfected\t380825_warm_infected\r\n",
      "TRINITY_DN9654_c0_g1_i1\t269\t195\t271\t245\r\n",
      "TRINITY_DN2423_c0_g1_i5\t70.0962\t54.7109\t113.331\t102.982\r\n",
      "TRINITY_DN35884_c0_g1_i1\t17\t11\t12\t9\r\n",
      "TRINITY_DN51908_c0_g1_i1\t3\t4\t7\t4\r\n",
      "TRINITY_DN10230_c0_g1_i5\t0\t0\t0\t0\r\n",
      "TRINITY_DN556_c1_g1_i4\t0\t23.6851\t29.7852\t9.53139\r\n",
      "TRINITY_DN6709_c0_g1_i8\t0\t0\t0\t0\r\n",
      "TRINITY_DN25396_c0_g1_i5\t0\t0\t0\t1.00319\r\n",
      "TRINITY_DN881_c3_g1_i1\t806\t864\t1449\t1267\r\n"
     ]
    }
   ],
   "source": [
    "!head /Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/kallisto-0812/kallisto-0812.isoform.counts.matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}