{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Tue Mar 3 08:05:13 PST 2020\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.931\n", "BogoMIPS: 5851.96\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 5.6G 51G 626M 14G 63G\n", "Swap: 4.7G 0B 4.7G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set variables\n", "`%env` variables are good for passing to bash cells" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming\n", "env: rsync_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "env: wget_command=--directory-prefix=$/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming --quiety --no-directories --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/\n", "env: og_fasta=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta\n", "env: og_fai=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta.fai\n", "env: og_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n", "env: gene_gff=Olurida_v081-20190709.gene.gff\n", "env: genes_fasta=Olurida_v081.bedtools.genes.fasta\n", "env: genes_fai=Olurida_v081.bedtools.genes.fasta.fai\n", "env: final_genes_fasta=Olurida_v081.genes.fasta\n", "env: final_genes_fai=Olurida_v081.genes.fasta.fai\n", "env: getfasta=/home/sam/programs/bedtools-2.28.0/bin/fastaFromBed\n", "env: samtools=/home/sam/programs/samtools-1.9/samtools\n" ] } ], "source": [ "# Set workding directory\n", "%env wd=/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming\n", "wd=\"/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming\"\n", "\n", "%env rsync_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "%env wget_command=--directory-prefix=${wd} --quiety --no-directories --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/\n", "\n", "%env og_fasta=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta\n", "%env og_fai=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta.fai\n", "%env og_gff=Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n", "%env gene_gff=Olurida_v081-20190709.gene.gff\n", "%env genes_fasta=Olurida_v081.bedtools.genes.fasta\n", "%env genes_fai=Olurida_v081.bedtools.genes.fasta.fai\n", "%env final_genes_fasta=Olurida_v081.genes.fasta\n", "%env final_genes_fai=Olurida_v081.genes.fasta.fai\n", "\n", "%env getfasta=/home/sam/programs/bedtools-2.28.0/bin/fastaFromBed\n", "%env samtools=/home/sam/programs/samtools-1.9/samtools" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create necessary directories" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [], "source": [ "%%bash\n", "mkdir --parents ${wd}" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20200228_swoose_olur_v081_fasta_renaming\n" ] } ], "source": [ "cd {wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Download Olur_v081 genes gff\n", "\n", "Taken from: https://owl.fish.washington.edu/halfshell/genomic-databank/" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n", " 3,104,658,743 100% 24.69MB/s 0:01:59 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 3,105,037,889 bytes 24,353,238.58 bytes/sec\n", "total size is 3,104,658,743 speedup is 1.00\n", "receiving incremental file list\n", "Olurida_v081-20190709.gene.gff\n", " 9,248,086 100% 38.18MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 9,249,332 bytes 2,642,674.86 bytes/sec\n", "total size is 9,248,086 speedup is 1.00\n", "total 3.0G\n", "-rw-r--r-- 1 sam users 8.9M Jul 16 2019 Olurida_v081-20190709.gene.gff\n", "-rw-rw-r-- 1 sam users 2.9G Dec 13 21:30 Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff\n" ] } ], "source": [ "%%bash\n", "# Create array of files from list\n", "files_array=(Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff Olurida_v081-20190709.gene.gff)\n", "\n", "\n", "for file in \"${files_array[@]}\"\n", "do\n", " rsync \\\n", " --archive \\\n", " --progress \\\n", " --verbose \\\n", " \"${rsync_owl}${file}\" \\\n", " .\n", "done\n", "\n", "ls -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Extract FastA from MAKER GFF" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "\n", "# Find the first line in the file that begins with \">\"\n", "# and print to the end of the file ('p' enables printing the first matching line, which would be skipped by default)\n", "sed --quiet '/^>/,$p' ${og_gff} > ${og_fasta}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Compare number of FastA header lines" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff:159429\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:159429\n" ] } ], "source": [ "%%bash\n", "\n", "grep --with-filename --count \"^>\" ${og_gff}\n", "grep --with-filename --count \"^>\" ${og_fasta}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create FastA index file" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Contig56127\t12532\t13\t60\t61\n", "Contig81695\t22750\t12767\t60\t61\n", "Contig130560\t1525\t35911\t60\t61\n", "Contig155059\t2121\t37476\t60\t61\n", "Contig61093\t10407\t39646\t60\t61\n", "Contig79811\t2248\t50240\t60\t61\n", "Contig89862\t14043\t52539\t60\t61\n", "Contig1111\t28792\t66829\t60\t61\n", "Contig160984\t4700\t96115\t60\t61\n", "Contig214118\t1068\t100908\t60\t61\n" ] } ], "source": [ "%%bash\n", "\n", "${samtools} faidx ${og_fasta} > ${og_fai}\n", "\n", "head ${og_fai}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Extract just gene seqences\n", "\n", "Splits input FastA based on GFF coordinates" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">Contig61093:7492-7946\n", ">Contig1111:24967-28696\n", ">Contig214118:200-926\n", ">Contig58217:9735-11541\n", ">Contig2046:2294-18394\n", ">Contig9540:4302-10179\n", ">Contig52254:8907-11733\n", ">Contig36645:2184-3340\n", ">Contig3008:531-6482\n", ">Contig67269:2368-12743\n" ] } ], "source": [ "%%bash\n", "${getfasta} -fi ${og_fasta} -bed ${gene_gff} > ${genes_fasta}\n", "\n", "grep \">\" ${genes_fasta} | head" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Change scaffold names and file names" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "bash: line 20: syntax error near unexpected token `done'\n", "bash: line 20: `done'\n" ] }, { "ename": "CalledProcessError", "evalue": "Command 'b'\\n# Array of \"old\" scaffold names\\n# Formats names to match FastA headers.\\n# Subtracts \"1\" from the start position to match bedTools 0-based counting\\nmapfile -t orig_scaffold_names < <(awk -F\"\\\\t\" \\'NR > 1 {print $1\":\"$4-1\"-\"$5}\\' ${gene_gff})\\n\\n# Array of new scaffold names\\n# Separators set as \"=\" and \";\" to pull out new IDs\\n# NR > 1 skips the first line (i.e. header)\\nmapfile -t new_scaffold_names < <(awk -F\"[=;]\" \\'NR > 1 {print $4}\\' ${gene_gff})\\n\\n# sed substitution\\n# creates sed script to find original scaffold names and replace them with new scafold names\\n# and passes to sed via stdin\\nfor index in \"${!orig_scaffold_names[@]}\"\\n do\\n printf \"s/%s/%s/\\\\n\" \"${orig_scaffold_names[index]}\" \"${new_scaffold_names[index]}\"\\n done | sed --file - \"${genes_fasta}\" \\\\\\n >> \"${final_genes_fasta}\"\\ndone\\n\\nls -ltrh\\n'' returned non-zero exit status 2.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_cell_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'bash'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'\\n# Array of \"old\" scaffold names\\n# Formats names to match FastA headers.\\n# Subtracts \"1\" from the start position to match bedTools 0-based counting\\nmapfile -t orig_scaffold_names < <(awk -F\"\\\\t\" \\'NR > 1 {print $1\":\"$4-1\"-\"$5}\\' ${gene_gff})\\n\\n# Array of new scaffold names\\n# Separators set as \"=\" and \";\" to pull out new IDs\\n# NR > 1 skips the first line (i.e. header)\\nmapfile -t new_scaffold_names < <(awk -F\"[=;]\" \\'NR > 1 {print $4}\\' ${gene_gff})\\n\\n# sed substitution\\n# creates sed script to find original scaffold names and replace them with new scafold names\\n# and passes to sed via stdin\\nfor index in \"${!orig_scaffold_names[@]}\"\\n do\\n printf \"s/%s/%s/\\\\n\" \"${orig_scaffold_names[index]}\" \"${new_scaffold_names[index]}\"\\n done | sed --file - \"${genes_fasta}\" \\\\\\n >> \"${final_genes_fasta}\"\\ndone\\n\\nls -ltrh\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mrun_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m 2350\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2351\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mmagic_arg_s\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2352\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2353\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2354\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mnamed_script_magic\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshebang\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0;31m# write a basic docstring:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n", "\u001b[0;32m~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magic.py\u001b[0m in \u001b[0;36m\u001b[0;34m(f, *a, **k)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_error\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m!=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mCalledProcessError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_script\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mto_close\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mCalledProcessError\u001b[0m: Command 'b'\\n# Array of \"old\" scaffold names\\n# Formats names to match FastA headers.\\n# Subtracts \"1\" from the start position to match bedTools 0-based counting\\nmapfile -t orig_scaffold_names < <(awk -F\"\\\\t\" \\'NR > 1 {print $1\":\"$4-1\"-\"$5}\\' ${gene_gff})\\n\\n# Array of new scaffold names\\n# Separators set as \"=\" and \";\" to pull out new IDs\\n# NR > 1 skips the first line (i.e. header)\\nmapfile -t new_scaffold_names < <(awk -F\"[=;]\" \\'NR > 1 {print $4}\\' ${gene_gff})\\n\\n# sed substitution\\n# creates sed script to find original scaffold names and replace them with new scafold names\\n# and passes to sed via stdin\\nfor index in \"${!orig_scaffold_names[@]}\"\\n do\\n printf \"s/%s/%s/\\\\n\" \"${orig_scaffold_names[index]}\" \"${new_scaffold_names[index]}\"\\n done | sed --file - \"${genes_fasta}\" \\\\\\n >> \"${final_genes_fasta}\"\\ndone\\n\\nls -ltrh\\n'' returned non-zero exit status 2." ] } ], "source": [ "%%bash\n", "\n", "# Array of \"old\" scaffold names\n", "# Formats names to match FastA headers.\n", "# Subtracts \"1\" from the start position to match bedTools 0-based counting\n", "mapfile -t orig_scaffold_names < <(awk -F\"\\t\" 'NR > 1 {print $1\":\"$4-1\"-\"$5}' ${gene_gff})\n", "\n", "# Array of new scaffold names\n", "# Separators set as \"=\" and \";\" to pull out new IDs\n", "# NR > 1 skips the first line (i.e. header)\n", "mapfile -t new_scaffold_names < <(awk -F\"[=;]\" 'NR > 1 {print $4}' ${gene_gff})\n", "\n", "# sed substitution\n", "# creates sed script to find original scaffold names and replace them with new scafold names\n", "# and passes to sed via stdin\n", "for index in \"${!orig_scaffold_names[@]}\"\n", " do\n", " printf \"s/%s/%s/\\n\" \"${orig_scaffold_names[index]}\" \"${new_scaffold_names[index]}\"\n", " done | sed --file - \"${genes_fasta}\" \\\n", " >> \"${final_genes_fasta}\"\n", "done\n", "\n", "ls -ltrh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hmmm, this threw an error? Weird.\n", "\n", "Also, this took a little over 17 _hours_ to run and then this happens??!! Maybe it would've run faster if I told it just to process lines that began with a \"`>`\", which would, theoreticaly, prevent `sed` from searching every sinlge line in the file? \n", "\n", "Ugh. Let's check 'em out to see if things look OK or not." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Check old and new FastAs to confirm substituions" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Olurida_v081.bedtools.genes.fasta:32210\n", "Olurida_v081.genes.fasta:32210\n" ] } ], "source": [ "%%bash\n", "\n", "grep --with-filename --count \"^>\" ${genes_fasta}\n", "\n", "grep --with-filenam --count \"^>\" ${final_genes_fasta}" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig56127\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig81695\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig130560\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig155059\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig61093\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig79811\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig89862\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig1111\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig160984\n", "Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.fasta:>Contig214118\n", "Olurida_v081.bedtools.genes.fasta:>Contig61093:7492-7946\n", "Olurida_v081.bedtools.genes.fasta:>Contig1111:24967-28696\n", "Olurida_v081.bedtools.genes.fasta:>Contig214118:200-926\n", "Olurida_v081.bedtools.genes.fasta:>Contig58217:9735-11541\n", "Olurida_v081.bedtools.genes.fasta:>Contig2046:2294-18394\n", "Olurida_v081.bedtools.genes.fasta:>Contig9540:4302-10179\n", "Olurida_v081.bedtools.genes.fasta:>Contig52254:8907-11733\n", "Olurida_v081.bedtools.genes.fasta:>Contig36645:2184-3340\n", "Olurida_v081.bedtools.genes.fasta:>Contig3008:531-6482\n", "Olurida_v081.bedtools.genes.fasta:>Contig67269:2368-12743\n", "Olurida_v081.genes.fasta:>OLUR_00020575\n", "Olurida_v081.genes.fasta:>OLUR_00006628\n", "Olurida_v081.genes.fasta:>OLUR_00032161\n", "Olurida_v081.genes.fasta:>OLUR_00019127\n", "Olurida_v081.genes.fasta:>OLUR_00011450\n", "Olurida_v081.genes.fasta:>OLUR_00018391\n", "Olurida_v081.genes.fasta:>OLUR_00011614\n", "Olurida_v081.genes.fasta:>OLUR_00022996\n", "Olurida_v081.genes.fasta:>OLUR_00018754\n", "Olurida_v081.genes.fasta:>OLUR_00017261\n" ] } ], "source": [ "%%bash\n", "\n", "grep --with-filename \"^>\" ${og_fasta} | head\n", "\n", "grep --with-filename \"^>\" ${genes_fasta} | head\n", "\n", "grep --with-filenam \"^>\" ${final_genes_fasta} | head" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Well, the counts and substitutions all look fine, so I guess we're good to go?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Make FastA index file for new FastA" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OLUR_00020575\t454\t15\t454\t455\n", "OLUR_00006628\t3729\t485\t3729\t3730\n", "OLUR_00032161\t726\t4230\t726\t727\n", "OLUR_00019127\t1806\t4972\t1806\t1807\n", "OLUR_00011450\t16100\t6794\t16100\t16101\n", "OLUR_00018391\t5877\t22910\t5877\t5878\n", "OLUR_00011614\t2826\t28803\t2826\t2827\n", "OLUR_00022996\t1156\t31645\t1156\t1157\n", "OLUR_00018754\t5951\t32817\t5951\t5952\n", "OLUR_00017261\t10375\t38784\t10375\t10376\n" ] } ], "source": [ "%%bash\n", "\n", "${samtools} faidx ${final_genes_fasta} > ${final_genes_fai}\n", "\n", "head ${final_genes_fai}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cleanup" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 213M\n", "-rw-rw-r-- 1 sam sam 212M Mar 4 01:22 Olurida_v081.genes.fasta\n", "-rw-rw-r-- 1 sam sam 1.2M Mar 4 05:32 Olurida_v081.genes.fasta.fai\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "rm: cannot remove 'Olurida_v081.bedtools.genes.fasta.fai': No such file or directory\n" ] } ], "source": [ "%%bash\n", "rm ${genes_fasta} ${genes_fai} ${og_fasta} ${og_fai} ${og_gff} ${gene_gff}\n", "\n", "ls -ltrh" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 4 }