{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Notebook to subset 18 scaffolds from Pgenerosa_v070.fa\n",
    "\n",
    "#### Per this [GitHub issue](https://github.com/RobertsLab/resources/issues/705)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TODAY'S DATE:\n",
      "Tue Jun 25 12:51:41 PDT 2019\n",
      "------------\n",
      "\n",
      "Distributor ID:\tUbuntu\n",
      "Description:\tUbuntu 16.04.6 LTS\n",
      "Release:\t16.04\n",
      "Codename:\txenial\n",
      "\n",
      "------------\n",
      "HOSTNAME: \n",
      "swoose\n",
      "\n",
      "------------\n",
      "Computer Specs:\n",
      "\n",
      "Architecture: x86_64\n",
      "CPU op-mode(s): 32-bit, 64-bit\n",
      "Byte Order: Little Endian\n",
      "CPU(s): 24\n",
      "On-line CPU(s) list: 0-23\n",
      "Thread(s) per core: 2\n",
      "Core(s) per socket: 6\n",
      "Socket(s): 2\n",
      "NUMA node(s): 1\n",
      "Vendor ID: GenuineIntel\n",
      "CPU family: 6\n",
      "Model: 44\n",
      "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n",
      "Stepping: 2\n",
      "CPU MHz: 2926.094\n",
      "BogoMIPS: 5851.96\n",
      "Virtualization: VT-x\n",
      "L1d cache: 32K\n",
      "L1i cache: 32K\n",
      "L2 cache: 256K\n",
      "L3 cache: 12288K\n",
      "NUMA node0 CPU(s): 0-23\n",
      "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n",
      "\n",
      "------------\n",
      "\n",
      "Memory Specs\n",
      "\n",
      " total used free shared buff/cache available\n",
      "Mem: 70G 11G 10G 553M 49G 58G\n",
      "Swap: 4.7G 292M 4.4G\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No LSB modules are available.\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Record when and where this notebook was run (date, OS, host, CPU, memory)\n",
    "echo \"TODAY'S DATE:\"\n",
    "date\n",
    "echo \"------------\"\n",
    "echo \"\"\n",
    "# Display operating system info\n",
    "lsb_release -a\n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"HOSTNAME: \"; hostname \n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"Computer Specs:\"\n",
    "echo \"\"\n",
    "# Display CPU details\n",
    "lscpu\n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"\"\n",
    "echo \"Memory Specs\"\n",
    "echo \"\"\n",
    "# Display memory usage in human-readable units\n",
    "free -mh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: work_dir=/home/sam/analyses/20190625_pgen_v070_scaffold_subsetting\n"
     ]
    }
   ],
   "source": [
    "# Set working directory once in Python, then export the same value to the\n",
    "# environment so that %%bash cells can read it as ${work_dir}\n",
    "work_dir = \"/home/sam/analyses/20190625_pgen_v070_scaffold_subsetting\"\n",
    "%env work_dir=$work_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: samtools=/home/sam/programs/samtools-1.9/samtools\n"
     ]
    }
   ],
   "source": [
    "# Set samtools path - %env is useful for bash\n",
    "%env samtools=/home/sam/programs/samtools-1.9/samtools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: fasta_subset_names=fasta_subset_names.txt\n",
      "env: out_fasta=Pgenerosa_v070.18.fa\n",
      "env: p70_fasta=Pgenerosa_v070.fa\n",
      "env: p70_fai=Pgenerosa_v070.fa.fai\n"
     ]
    }
   ],
   "source": [
    "# Set input and output filenames (all relative to the working directory)\n",
    "%env fasta_subset_names=fasta_subset_names.txt\n",
    "%env out_fasta=Pgenerosa_v070.18.fa\n",
    "%env p70_fasta=Pgenerosa_v070.fa\n",
    "%env p70_fai=Pgenerosa_v070.fa.fai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "# -p makes this cell safe to re-run: no error if the directory already exists\n",
    "mkdir -p \"${work_dir}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/sam/analyses/20190625_pgen_v070_scaffold_subsetting\n"
     ]
    }
   ],
   "source": [
    "# Change the kernel's working directory so later cells can use relative filenames.\n",
    "# Explicit %cd works even when automagic is disabled.\n",
    "%cd $work_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download files\n",
    "\n",
    "Downloaded via `rsync` - uncomment `wget` lines if needed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "receiving incremental file list\n",
      "Pgenerosa_v070.fa\n",
      "\n",
      "sent 30 bytes received 2,247,392,295 bytes 15,995,674.91 bytes/sec\n",
      "total size is 2,247,117,885 speedup is 1.00\n",
      "receiving incremental file list\n",
      "Pgenerosa_v070.fa.fai\n",
      "\n",
      "sent 30 bytes received 20,294,786 bytes 5,798,518.86 bytes/sec\n",
      "total size is 20,292,201 speedup is 1.00\n",
      "-----------------------\n",
      "\n",
      "total 2.2G\n",
      "-rw-r--r-- 1 sam users 2.1G Feb 11 12:13 Pgenerosa_v070.fa\n",
      "-rw-rw-rw- 1 sam users 20M Feb 11 13:49 Pgenerosa_v070.fa.fai\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Fetch the genome FastA and its index, using the filenames set earlier.\n",
    "# NOTE(review): owl is presumably an SSH host alias on this machine - confirm before re-running elsewhere.\n",
    "rsync -av \"owl:/volume1/web/halfshell/genomic-databank/${p70_fasta}\" .\n",
    "\n",
    "rsync -av \"owl:/volume1/web/halfshell/genomic-databank/${p70_fai}\" .\n",
    "\n",
    "#wget http://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa\n",
    "#wget http://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa.fai\n",
    "\n",
    "echo \"-----------------------\"\n",
    "echo \"\"\n",
    "ls -ltrh"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check FastA index file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PGA_scaffold1__77_contigs__length_89643857\t89643857\t44\t80\t81\n",
      "PGA_scaffold2__36_contigs__length_69596280\t69596280\t90764494\t80\t81\n",
      "PGA_scaffold3__111_contigs__length_57743597\t57743597\t161230773\t80\t81\n",
      "PGA_scaffold4__129_contigs__length_65288255\t65288255\t219696210\t80\t81\n",
      "PGA_scaffold5__109_contigs__length_67248332\t67248332\t285800614\t80\t81\n",
      "PGA_scaffold6__104_contigs__length_61759565\t61759565\t353889596\t80\t81\n",
      "PGA_scaffold7__69_contigs__length_43120122\t43120122\t416421200\t80\t81\n",
      "PGA_scaffold8__63_contigs__length_61151155\t61151155\t460080368\t80\t81\n",
      "PGA_scaffold9__45_contigs__length_38581958\t38581958\t521995957\t80\t81\n",
      "PGA_scaffold10__49_contigs__length_53961475\t53961475\t561060235\t80\t81\n",
      "PGA_scaffold11__79_contigs__length_51449921\t51449921\t615696274\t80\t81\n",
      "PGA_scaffold12__71_contigs__length_50438331\t50438331\t667789365\t80\t81\n",
      "PGA_scaffold13__52_contigs__length_44396874\t44396874\t718858221\t80\t81\n",
      "PGA_scaffold14__91_contigs__length_45393038\t45393038\t763810101\t80\t81\n",
      "PGA_scaffold15__101_contigs__length_47938513\t47938513\t809770598\t80\t81\n",
      "PGA_scaffold16__33_contigs__length_31980953\t31980953\t858308388\t80\t81\n",
      "PGA_scaffold17__51_contigs__length_34923512\t34923512\t890689148\t80\t81\n",
      "PGA_scaffold18__69_contigs__length_27737463\t27737463\t926049249\t80\t81\n",
      "PGA_scaffold19__1_contigs__length_6170\t6170\t954133471\t80\t81\n",
      "PGA_scaffold20__1_contigs__length_7749\t7749\t954139759\t80\t81\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Preview the first 20 index records to see where the large scaffolds end\n",
    "head -n 20 \"${p70_fai}\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract IDs for top 18 scaffolds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PGA_scaffold1__77_contigs__length_89643857\n",
      "PGA_scaffold2__36_contigs__length_69596280\n",
      "PGA_scaffold3__111_contigs__length_57743597\n",
      "PGA_scaffold4__129_contigs__length_65288255\n",
      "PGA_scaffold5__109_contigs__length_67248332\n",
      "PGA_scaffold6__104_contigs__length_61759565\n",
      "PGA_scaffold7__69_contigs__length_43120122\n",
      "PGA_scaffold8__63_contigs__length_61151155\n",
      "PGA_scaffold9__45_contigs__length_38581958\n",
      "PGA_scaffold10__49_contigs__length_53961475\n",
      "PGA_scaffold11__79_contigs__length_51449921\n",
      "PGA_scaffold12__71_contigs__length_50438331\n",
      "PGA_scaffold13__52_contigs__length_44396874\n",
      "PGA_scaffold14__91_contigs__length_45393038\n",
      "PGA_scaffold15__101_contigs__length_47938513\n",
      "PGA_scaffold16__33_contigs__length_31980953\n",
      "PGA_scaffold17__51_contigs__length_34923512\n",
      "PGA_scaffold18__69_contigs__length_27737463\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Column 1 of the FastA index holds the sequence name; the 18 scaffolds of\n",
    "# interest are the first 18 records (see the index preview above)\n",
    "awk -F'\\t' '{print $1}' \"${p70_fai}\" \\\n",
    "| head -n 18 \\\n",
    "> \"${fasta_subset_names}\"\n",
    "\n",
    "cat \"${fasta_subset_names}\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract FastAs for top 18 scaffolds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "18\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# xargs appends each scaffold name as a region argument to samtools faidx\n",
    "xargs \"${samtools}\" faidx \"${p70_fasta}\" < \"${fasta_subset_names}\" > \"${out_fasta}\"\n",
    "\n",
    "# Count FastA header lines (anchored to start of line) - expect 18\n",
    "grep -c '^>' \"${out_fasta}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 3.1G\n",
      "-rw-r--r-- 1 sam users 2.1G Feb 11 12:13 Pgenerosa_v070.fa\n",
      "-rw-rw-rw- 1 sam users 20M Feb 11 13:49 Pgenerosa_v070.fa.fai\n",
      "-rw-rw-r-- 1 sam sam 788 Jun 25 12:55 fasta_subset_names.txt\n",
      "-rw-rw-r-- 1 sam sam 914M Jun 25 12:55 Pgenerosa_v070.18.fa\n",
      "-rw-rw-r-- 1 sam sam 1.3K Jun 25 12:56 Pgenerosa_v070.18.fa.fai\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Build a FastA index for the new 18-scaffold subset\n",
    "\"${samtools}\" faidx \"${out_fasta}\"\n",
    "\n",
    "ls -ltrh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 914M\n",
      "-rw-rw-r-- 1 sam sam 788 Jun 25 12:55 fasta_subset_names.txt\n",
      "-rw-rw-r-- 1 sam sam 914M Jun 25 12:55 Pgenerosa_v070.18.fa\n",
      "-rw-rw-r-- 1 sam sam 1.3K Jun 25 12:56 Pgenerosa_v070.18.fa.fai\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Remove the full-genome FastA and index downloaded earlier; only the subset is kept\n",
    "rm \"${p70_fasta}\" \"${p70_fai}\"\n",
    "\n",
    "ls -ltrh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}