{ "cells": [
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Tue Nov 5 07:39:45 PST 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.971\n", "BogoMIPS: 5851.97\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 29G 388M 575M 40G 39G\n", "Swap: 4.7G 548M 4.1G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ],
"source": [
"%%bash\n",
"echo \"TODAY'S DATE:\"\n",
"date\n",
"echo \"------------\"\n",
"echo \"\"\n",
"#Display operating system info\n",
"lsb_release -a\n",
"echo \"\"\n",
"echo \"------------\"\n",
"echo \"HOSTNAME: \"; hostname \n",
"echo \"\"\n",
"echo \"------------\"\n",
"echo \"Computer Specs:\"\n",
"echo \"\"\n",
"lscpu\n",
"echo \"\"\n",
"echo \"------------\"\n",
"echo \"\"\n",
"echo \"Memory Specs\"\n",
"echo \"\"\n",
"free -mh"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Set variables\n", "`%env` variables are good for passing to bash cells" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20191105_swoose_pgen_v074_renaming\n", "env: rsync_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "env: files_list=Panopea-generosa-vv0.74.a4.CDS.gff3 Panopea-generosa-vv0.74.a4.exon.gff3 Panopea-generosa-vv0.74.a4.gene.gff3 Panopea-generosa-vv0.74.a4.mRNA.gff3 Panopea-generosa-vv0.74.a4.repeat_region.gff3 Panopea-generosa-vv0.74.a4.rRNA.gff3 Panopea-generosa-vv0.74.a4.tRNA.gff3 Pgenerosa_v074.fa Pgenerosa_v074.fa.fai Pgenerosa_v074.CpG.gff Panopea-generosa-vv0.74.a4.intergenic.bed Panopea-generosa-vv0.74.a4.introns.bed Panopea-generosa-vv0.74.a4.repeats.DNA.gff3 Panopea-generosa-vv0.74.a4.repeats.LTR.gff3 Panopea-generosa-vv0.74.a4.repeats.SINE.gff3 Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3 Panopea-generosa-vv0.74.a4.repeats.LINE.gff3 Panopea-generosa-vv0.74.a4.repeats.RC.gff3 Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3\n", "env: wget_command=--directory-prefix=$/home/sam/analyses/20191105_swoose_pgen_v074_renaming --quiety --no-directories --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/\n", "env: new_prefix=Panopea-generosa-v1.0\n", "env: fa_prefix=Pgenerosa_v074\n", "env: gff_prefix=Panopea-generosa-vv0.74\n" ] } ],
"source": [
"# Working directory: defined once in Python, then exported for %%bash cells\n",
"wd = \"/home/sam/analyses/20191105_swoose_pgen_v074_renaming\"\n",
"%env wd={wd}\n",
"\n",
"# rsync source in host:path form; the download cell appends a file name to it\n",
"%env rsync_owl=owl:/volume1/web/halfshell/genomic-databank/\n",
"\n",
"# wget options followed by the base URL (append a file name to fetch that file).\n",
"# Use {wd}, not ${wd}: IPython expands the braces itself and would leave a stray \"$\" in the path.\n",
"%env wget_command=--directory-prefix={wd} --quiet --no-directories --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/\n",
"\n",
"# new_prefix replaces fa_prefix or gff_prefix in the file names when the files are renamed\n",
"%env new_prefix=Panopea-generosa-v1.0\n",
"%env fa_prefix=Pgenerosa_v074\n",
"%env gff_prefix=Panopea-generosa-vv0.74"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Create necessary directories" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [],
"source": [
"%%bash\n",
"mkdir --parents \"${wd}\""
] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20191105_swoose_pgen_v074_renaming\n" ] } ],
"source": [
"cd {wd}"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Download Pgen_v074 files\n" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.CDS.gff3\n", " 62,862,005 100% 27.44MB/s 0:00:02 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 62,869,800 bytes 9,672,281.54 bytes/sec\n", "total size is 62,862,005 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.exon.gff3\n", " 64,663,603 100% 26.86MB/s 0:00:02 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 64,671,619 bytes 9,949,484.46 bytes/sec\n", "total size is 64,663,603 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.gene.gff3\n", " 10,997,681 100% 25.90MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 10,999,145 bytes 3,142,621.43 bytes/sec\n", "total size is 10,997,681 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.mRNA.gff3\n", " 10,666,540 100% 22.46MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 10,667,964 bytes 7,111,996.00 bytes/sec\n", "total size is 10,666,540 speedup is 1.00\n", "receiving incremental file list\n",
"Panopea-generosa-vv0.74.a4.repeat_region.gff3\n", " 390,130,212 100% 27.11MB/s 0:00:13 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 390,177,965 bytes 21,090,702.43 bytes/sec\n", "total size is 390,130,212 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.rRNA.gff3\n", " 1,447 100% 1.38MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 1,570 bytes 640.00 bytes/sec\n", "total size is 1,447 speedup is 0.90\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.tRNA.gff3\n", " 2,836,671 100% 29.41MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 2,837,138 bytes 1,134,867.20 bytes/sec\n", "total size is 2,836,671 speedup is 1.00\n", "receiving incremental file list\n", "Pgenerosa_v074.fa\n", " 958,059,901 100% 22.77MB/s 0:00:40 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 958,176,954 bytes 23,088,602.02 bytes/sec\n", "total size is 958,059,901 speedup is 1.00\n", "receiving incremental file list\n", "Pgenerosa_v074.fa.fai\n", " 1,230 100% 1.17MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 1,338 bytes 547.20 bytes/sec\n", "total size is 1,230 speedup is 0.90\n", "receiving incremental file list\n", "Pgenerosa_v074.CpG.gff\n", " 2,623,644,046 100% 27.02MB/s 0:01:32 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 2,623,964,425 bytes 28,063,790.96 bytes/sec\n", "total size is 2,623,644,046 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.intergenic.bed\n", " 2,113,935 100% 28.80MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 2,114,319 bytes 1,409,566.00 bytes/sec\n", "total size is 2,113,935 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.introns.bed\n", " 9,488,151 100% 30.26MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 9,489,433 bytes 6,326,308.67 bytes/sec\n", "total size is 9,488,151 speedup is 1.00\n", "receiving incremental file 
list\n", "Panopea-generosa-vv0.74.a4.repeats.DNA.gff3\n", " 5,560,657 100% 29.96MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 5,561,463 bytes 3,707,662.00 bytes/sec\n", "total size is 5,560,657 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.repeats.LTR.gff3\n", " 752,111 100% 32.60MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 752,329 bytes 501,572.67 bytes/sec\n", "total size is 752,111 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.repeats.SINE.gff3\n", " 10,492,279 100% 29.78MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 10,493,691 bytes 6,995,814.00 bytes/sec\n", "total size is 10,492,279 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3\n", " 350,419,849 100% 29.33MB/s 0:00:11 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 350,462,756 bytes 28,037,022.88 bytes/sec\n", "total size is 350,419,849 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.repeats.LINE.gff3\n", " 18,049,492 100% 13.58MB/s 0:00:01 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 18,051,824 bytes 3,282,155.27 bytes/sec\n", "total size is 18,049,492 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.repeats.RC.gff3\n", " 143,208 100% 45.52MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 143,353 bytes 95,588.67 bytes/sec\n", "total size is 143,208 speedup is 1.00\n", "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3\n", " 4,716,710 100% 8.63MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 4,717,422 bytes 3,144,968.00 bytes/sec\n", "total size is 4,716,710 speedup is 1.00\n", "total 4.3G\n", "-rwxr--r-- 1 sam users 60M Oct 14 10:13 Panopea-generosa-vv0.74.a4.CDS.gff3\n", "-rwxr--r-- 1 sam users 62M Oct 14 10:13 
Panopea-generosa-vv0.74.a4.exon.gff3\n", "-rwxr--r-- 1 sam users 11M Oct 14 10:13 Panopea-generosa-vv0.74.a4.gene.gff3\n", "-rw-rw-r-- 1 sam users 2.1M Oct 30 08:41 Panopea-generosa-vv0.74.a4.intergenic.bed\n", "-rw-rw-r-- 1 sam users 9.1M Oct 30 08:41 Panopea-generosa-vv0.74.a4.introns.bed\n", "-rwxr--r-- 1 sam users 11M Oct 14 10:13 Panopea-generosa-vv0.74.a4.mRNA.gff3\n", "-rwxr--r-- 1 sam users 373M Oct 14 10:13 Panopea-generosa-vv0.74.a4.repeat_region.gff3\n", "-rw-rw-r-- 1 sam users 5.4M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.DNA.gff3\n", "-rw-rw-r-- 1 sam users 18M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LINE.gff3\n", "-rw-rw-r-- 1 sam users 735K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LTR.gff3\n", "-rw-rw-r-- 1 sam users 140K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.RC.gff3\n", "-rw-rw-r-- 1 sam users 4.5M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3\n", "-rw-rw-r-- 1 sam users 11M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.SINE.gff3\n", "-rw-rw-r-- 1 sam users 335M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3\n", "-rwxr--r-- 1 sam users 1.5K Oct 14 10:13 Panopea-generosa-vv0.74.a4.rRNA.gff3\n", "-rwxr--r-- 1 sam users 2.8M Oct 14 10:13 Panopea-generosa-vv0.74.a4.tRNA.gff3\n", "-rw-rw-r-- 1 sam users 2.5G Aug 21 11:45 Pgenerosa_v074.CpG.gff\n", "-rw-rw-rw- 1 sam users 914M Jun 26 08:49 Pgenerosa_v074.fa\n", "-rw-rw-rw- 1 sam users 1.3K Jun 26 08:54 Pgenerosa_v074.fa.fai\n" ] } ],
"source": [
"%%bash\n",
"# Files to download. Brace expansion keeps the list readable; it expands to\n",
"# the same 19 file names, in the same order, as a flat space-separated list.\n",
"files_array=(\n",
"  Panopea-generosa-vv0.74.a4.{CDS,exon,gene,mRNA,repeat_region,rRNA,tRNA}.gff3\n",
"  Pgenerosa_v074.{fa,fa.fai,CpG.gff}\n",
"  Panopea-generosa-vv0.74.a4.{intergenic,introns}.bed\n",
"  Panopea-generosa-vv0.74.a4.repeats.{DNA,LTR,SINE,Unknown,LINE,RC,Simple_repeat}.gff3\n",
")\n",
"\n",
"# Copy each file from the rsync source (rsync_owl) into the current directory\n",
"for file in \"${files_array[@]}\"\n",
"do\n",
"  rsync \\\n",
"    --archive \\\n",
"    --progress \\\n",
"    --verbose \\\n",
"    \"${rsync_owl}${file}\" \\\n",
"    .\n",
"done\n",
"\n",
"ls -lh"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### If you need to download via wget instead, uncomment the lines in the cell below" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [],
"source": [
"# %%bash\n",
"# # wget_command holds the wget options followed by the base URL.\n",
"# # It is intentionally left unquoted so the shell splits it into separate\n",
"# # arguments; the file name is appended directly to the trailing base URL.\n",
"# files_array=(\n",
"#   Panopea-generosa-vv0.74.a4.{CDS,exon,gene,mRNA,repeat_region,rRNA,tRNA}.gff3\n",
"#   Pgenerosa_v074.{fa,fa.fai,CpG.gff}\n",
"#   Panopea-generosa-vv0.74.a4.{intergenic,introns}.bed\n",
"#   Panopea-generosa-vv0.74.a4.repeats.{DNA,LTR,SINE,Unknown,LINE,RC,Simple_repeat}.gff3\n",
"# )\n",
"# time for file in \"${files_array[@]}\"\n",
"# do\n",
"#   wget ${wget_command}${file}\n",
"# done\n",
"# ls -lh \"${wd}\""
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Change scaffold names and file names" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 7.9G\n", "-rw-rw-rw- 1 sam users 914M Jun 26 08:49 Pgenerosa_v074.fa\n", "-rw-rw-rw- 1 sam users 1.3K Jun 26 08:54 Pgenerosa_v074.fa.fai\n", "-rw-rw-r-- 1 sam users 2.5G Aug 21 11:45 Pgenerosa_v074.CpG.gff\n", "-rwxr--r-- 1 sam users 60M Oct 14 10:13 Panopea-generosa-vv0.74.a4.CDS.gff3\n", "-rwxr--r-- 1 sam users 62M Oct 14 10:13 Panopea-generosa-vv0.74.a4.exon.gff3\n", "-rwxr--r-- 1 sam users 11M Oct 14 10:13 Panopea-generosa-vv0.74.a4.gene.gff3\n", "-rwxr--r-- 1 sam users 11M Oct 14 10:13 Panopea-generosa-vv0.74.a4.mRNA.gff3\n", "-rwxr--r-- 1 sam users 373M Oct 14 10:13 Panopea-generosa-vv0.74.a4.repeat_region.gff3\n", "-rwxr--r-- 1 sam users 1.5K Oct 14 10:13 Panopea-generosa-vv0.74.a4.rRNA.gff3\n", "-rwxr--r-- 1 sam users 2.8M Oct 14 10:13 Panopea-generosa-vv0.74.a4.tRNA.gff3\n", "-rw-rw-r-- 1 sam users 11M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.SINE.gff3\n", "-rw-rw-r-- 1 
sam users 4.5M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3\n", "-rw-rw-r-- 1 sam users 140K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.RC.gff3\n", "-rw-rw-r-- 1 sam users 735K Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LTR.gff3\n", "-rw-rw-r-- 1 sam users 18M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.LINE.gff3\n", "-rw-rw-r-- 1 sam users 5.4M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.DNA.gff3\n", "-rw-rw-r-- 1 sam users 335M Oct 29 08:52 Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3\n", "-rw-rw-r-- 1 sam users 2.1M Oct 30 08:41 Panopea-generosa-vv0.74.a4.intergenic.bed\n", "-rw-rw-r-- 1 sam users 9.1M Oct 30 08:41 Panopea-generosa-vv0.74.a4.introns.bed\n", "-rw-rw-r-- 1 sam users 53M Nov 5 08:55 Panopea-generosa-v1.0.a4.CDS.gff3\n", "-rw-rw-r-- 1 sam users 55M Nov 5 08:55 Panopea-generosa-v1.0.a4.exon.gff3\n", "-rw-rw-r-- 1 sam users 9.5M Nov 5 08:55 Panopea-generosa-v1.0.a4.gene.gff3\n", "-rw-rw-r-- 1 sam users 9.1M Nov 5 08:55 Panopea-generosa-v1.0.a4.mRNA.gff3\n", "-rw-rw-r-- 1 sam users 326M Nov 5 08:55 Panopea-generosa-v1.0.a4.repeat_region.gff3\n", "-rw-rw-r-- 1 sam users 1.2K Nov 5 08:55 Panopea-generosa-v1.0.a4.rRNA.gff3\n", "-rw-rw-r-- 1 sam users 2.3M Nov 5 08:55 Panopea-generosa-v1.0.a4.tRNA.gff3\n", "-rw-rw-r-- 1 sam users 914M Nov 5 08:56 Panopea-generosa-v1.0.fa\n", "-rw-rw-r-- 1 sam users 658 Nov 5 08:56 Panopea-generosa-v1.0.fa.fai\n", "-rw-rw-r-- 1 sam users 2.0G Nov 5 08:59 Panopea-generosa-v1.0.CpG.gff\n", "-rw-rw-r-- 1 sam users 996K Nov 5 08:59 Panopea-generosa-v1.0.a4.intergenic.bed\n", "-rw-rw-r-- 1 sam users 4.4M Nov 5 08:59 Panopea-generosa-v1.0.a4.introns.bed\n", "-rw-rw-r-- 1 sam users 4.7M Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n", "-rw-rw-r-- 1 sam users 645K Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n", "-rw-rw-r-- 1 sam users 8.9M Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n", "-rw-rw-r-- 1 sam users 293M Nov 5 08:59 
Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n", "-rw-rw-r-- 1 sam users 16M Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n", "-rw-rw-r-- 1 sam users 123K Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.RC.gff3\n", "-rw-rw-r-- 1 sam users 4.0M Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n" ] } ],
"source": [
"%%bash\n",
"\n",
"# Original files to convert. Brace expansion yields the same 19 file names,\n",
"# in the same order, as a flat space-separated list.\n",
"files_array=(\n",
"  Panopea-generosa-vv0.74.a4.{CDS,exon,gene,mRNA,repeat_region,rRNA,tRNA}.gff3\n",
"  Pgenerosa_v074.{fa,fa.fai,CpG.gff}\n",
"  Panopea-generosa-vv0.74.a4.{intergenic,introns}.bed\n",
"  Panopea-generosa-vv0.74.a4.repeats.{DNA,LTR,SINE,Unknown,LINE,RC,Simple_repeat}.gff3\n",
")\n",
"\n",
"# Array of old scaffold names\n",
"# Uses first column of FastA index file to get scaffold names\n",
"mapfile -t orig_scaffold_names < <(awk '{print $1}' Pgenerosa_v074.fa.fai)\n",
"\n",
"# Array of new scaffold names: Scaffold_01 .. Scaffold_18\n",
"mapfile -t new_scaffold_names < <(printf 'Scaffold_%s\\n' {01..18})\n",
"\n",
"# An original name without a matching new name would be replaced by an\n",
"# empty string, so stop before touching any file.\n",
"if [ \"${#orig_scaffold_names[@]}\" -gt \"${#new_scaffold_names[@]}\" ]\n",
"then\n",
"  echo \"ERROR: ${#orig_scaffold_names[@]} original scaffold names but only ${#new_scaffold_names[@]} new names.\" >&2\n",
"  exit 1\n",
"fi\n",
"\n",
"# Build the sed script once: one s/old/new/ command per scaffold.\n",
"# The script is the same for every file, so it does not belong in the file loop.\n",
"sed_script=$(\n",
"  for index in \"${!orig_scaffold_names[@]}\"\n",
"  do\n",
"    printf 's/%s/%s/\\n' \"${orig_scaffold_names[index]}\" \"${new_scaffold_names[index]}\"\n",
"  done\n",
")\n",
"\n",
"for file in \"${files_array[@]}\"\n",
"do\n",
"  # Set new filename, depending on whether the name starts with the FastA prefix or the GFF prefix\n",
"  name_check=\"${file%%.*}\"\n",
"  if [ \"${name_check}\" = \"${fa_prefix}\" ]\n",
"  then\n",
"    new_name=${file/$fa_prefix/$new_prefix}\n",
"  else\n",
"    new_name=${file/$gff_prefix/$new_prefix}\n",
"  fi\n",
"\n",
"  # Replace original scaffold names with the new scaffold names.\n",
"  # Overwrite (>) instead of append (>>) so re-running this cell does not\n",
"  # duplicate the contents of an already existing output file.\n",
"  sed --expression \"${sed_script}\" \"${file}\" \\\n",
"  > \"${new_name}\"\n",
"done\n",
"\n",
"ls -ltrh"
] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t2\t125\t.\t+\t0\tID=PGEN_.00g000010.m01.CDS01;Name=PGEN_.00g000010.m01.CDS01;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t1995\t2095\t.\t+\t1\tID=PGEN_.00g000010.m01.CDS02;Name=PGEN_.00g000010.m01.CDS02;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t3325\t3495\t.\t+\t0\tID=PGEN_.00g000010.m01.CDS03;Name=PGEN_.00g000010.m01.CDS03;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t4651\t4719\t.\t+\t0\tID=PGEN_.00g000010.m01.CDS04;Name=PGEN_.00g000010.m01.CDS04;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t19808\t19943\t.\t-\t2\tID=PGEN_.00g000020.m01.CDS01;Name=PGEN_.00g000020.m01.CDS01;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t21133\t21362\t.\t-\t0\tID=PGEN_.00g000020.m01.CDS02;Name=PGEN_.00g000020.m01.CDS02;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t22487\t22613\t.\t-\t2\tID=PGEN_.00g000020.m01.CDS03;Name=PGEN_.00g000020.m01.CDS03;Parent=PGEN_.00g000020.m01;original_ID=cds.21510-PGEN_.00g234150.m01;Alias=cds.21510-PGEN_.00g234150.m01\n", "\n", "--------------------\n", "##Generated\n", "##gff-version\n", "##Project\n", "Scaffold_01\n", "Scaffold_02\n", "Scaffold_03\n", "Scaffold_04\n", "Scaffold_05\n", "Scaffold_06\n", "Scaffold_07\n", "Scaffold_08\n", "Scaffold_09\n", "Scaffold_10\n", "Scaffold_11\n", "Scaffold_12\n", "Scaffold_13\n", "Scaffold_14\n", "Scaffold_15\n", "Scaffold_16\n", "Scaffold_17\n", "Scaffold_18\n" ] } ],
"source": [
"%%bash\n",
"# Preview the start of the renamed CDS file\n",
"head Panopea-generosa-v1.0.a4.CDS.gff3\n",
"\n",
"echo \"\"\n",
"echo \"--------------------\"\n",
"# List the distinct values of the first whitespace-delimited field\n",
"awk '{print $1}' Panopea-generosa-v1.0.a4.CDS.gff3 | sort | uniq"
] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 3.7G\n", "-rw-rw-r-- 1 sam users 53M Nov 5 08:55 Panopea-generosa-v1.0.a4.CDS.gff3\n", "-rw-rw-r-- 1 sam users 55M Nov 5 08:55 Panopea-generosa-v1.0.a4.exon.gff3\n", "-rw-rw-r-- 1 sam users 9.5M Nov 5 08:55 Panopea-generosa-v1.0.a4.gene.gff3\n", "-rw-rw-r-- 1 sam users 9.1M Nov 5 08:55 Panopea-generosa-v1.0.a4.mRNA.gff3\n", "-rw-rw-r-- 1 sam users 326M Nov 5 08:55 Panopea-generosa-v1.0.a4.repeat_region.gff3\n", "-rw-rw-r-- 1 sam users 1.2K Nov 5 08:55 Panopea-generosa-v1.0.a4.rRNA.gff3\n", "-rw-rw-r-- 1 sam users 2.3M Nov 5 08:55 Panopea-generosa-v1.0.a4.tRNA.gff3\n", "-rw-rw-r-- 1 sam users 914M Nov 5 08:56 Panopea-generosa-v1.0.fa\n", "-rw-rw-r-- 1 sam users 658 Nov 5 08:56 Panopea-generosa-v1.0.fa.fai\n", "-rw-rw-r-- 1 sam users 2.0G Nov 5 08:59 Panopea-generosa-v1.0.CpG.gff\n", "-rw-rw-r-- 1 sam users 996K Nov 5 08:59 Panopea-generosa-v1.0.a4.intergenic.bed\n", "-rw-rw-r-- 1 sam users 4.4M Nov 5 08:59 Panopea-generosa-v1.0.a4.introns.bed\n", "-rw-rw-r-- 1 sam users 4.7M Nov 5 08:59 
Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n", "-rw-rw-r-- 1 sam users 645K Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n", "-rw-rw-r-- 1 sam users 8.9M Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n", "-rw-rw-r-- 1 sam users 293M Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n", "-rw-rw-r-- 1 sam users 16M Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n", "-rw-rw-r-- 1 sam users 123K Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.RC.gff3\n", "-rw-rw-r-- 1 sam users 4.0M Nov 5 08:59 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n" ] } ],
"source": [
"%%bash\n",
"# Original files, superseded by the renamed copies created earlier in this notebook\n",
"files_array=(Panopea-generosa-vv0.74.a4.CDS.gff3 Panopea-generosa-vv0.74.a4.exon.gff3 Panopea-generosa-vv0.74.a4.gene.gff3 Panopea-generosa-vv0.74.a4.mRNA.gff3 Panopea-generosa-vv0.74.a4.repeat_region.gff3 Panopea-generosa-vv0.74.a4.rRNA.gff3 Panopea-generosa-vv0.74.a4.tRNA.gff3 Pgenerosa_v074.fa Pgenerosa_v074.fa.fai Pgenerosa_v074.CpG.gff Panopea-generosa-vv0.74.a4.intergenic.bed Panopea-generosa-vv0.74.a4.introns.bed Panopea-generosa-vv0.74.a4.repeats.DNA.gff3 Panopea-generosa-vv0.74.a4.repeats.LTR.gff3 Panopea-generosa-vv0.74.a4.repeats.SINE.gff3 Panopea-generosa-vv0.74.a4.repeats.Unknown.gff3 Panopea-generosa-vv0.74.a4.repeats.LINE.gff3 Panopea-generosa-vv0.74.a4.repeats.RC.gff3 Panopea-generosa-vv0.74.a4.repeats.Simple_repeat.gff3)\n",
"\n",
"# Remove every original file with a single rm invocation\n",
"rm \"${files_array[@]}\"\n",
"\n",
"ls -ltrh"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }
],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 4 }