{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Mon Feb 25 14:55:58 PST 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.5 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.907\n", "BogoMIPS: 5851.96\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 6.9G 1.6G 1.1G 62G 62G\n", "Swap: 4.7G 825M 3.8G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [
"%%bash\n",
"# Provenance record: run date, OS release, host name, CPU and memory.\n",
"divider=\"------------\"\n",
"\n",
"printf '%s\\n' \"TODAY'S DATE:\"\n",
"date\n",
"printf '%s\\n\\n' \"${divider}\"\n",
"# Display operating system info\n",
"lsb_release -a\n",
"printf '\\n%s\\n' \"${divider}\"\n",
"printf '%s\\n' \"HOSTNAME: \"\n",
"hostname\n",
"printf '\\n%s\\n' \"${divider}\"\n",
"printf '%s\\n\\n' \"Computer Specs:\"\n",
"lscpu\n",
"printf '\\n%s\\n\\n' \"${divider}\"\n",
"printf '%s\\n\\n' \"Memory Specs\"\n",
"free -mh"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Set variables" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: seqkit=/home/sam/programs/seqkit-v0.9.3\n", "env: data_dir=/home/sam/data\n", "env: analyses_dir=/home/sam/analyses/20190225_cpg_oe\n" ] } ], "source": [ "%env seqkit=/home/sam/programs/seqkit-v0.9.3\n", "%env data_dir=/home/sam/data\n", "%env analyses_dir=/home/sam/analyses/20190225_cpg_oe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Download data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-rw-r-- 1 sam sam 409M Jan 9 13:06 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:07 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:07 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:07 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:07 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:08 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:08 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:08 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:09 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:09 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:09 
Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:10 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:10 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:10 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:11 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:11 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:11 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:11 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_7_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:12 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:12 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:12 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:13 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:13 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:13 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:14 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:14 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:14 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:15 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:15 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 
9 13:15 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_7_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:15 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:16 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:16 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:16 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:17 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:17 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:17 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HG_HG0F2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:18 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HG_HG2F1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:18 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HG_HG2M5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:18 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:18 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:19 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:19 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:19 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:20 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:20 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_1_pool_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:20 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:21 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_4_GENE.fa\n", 
"-rw-rw-r-- 1 sam sam 409M Jan 9 13:21 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_7_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:21 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_8_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:22 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:22 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:22 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:22 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:23 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:23 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:23 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:24 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:24 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:24 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:25 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:25 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:25 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH0H4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:26 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH2F6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:26 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH2F8_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:26 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH2M1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:27 
Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:27 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:27 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:27 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:28 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:28 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:28 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_1_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:29 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:29 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:29 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:30 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:30 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_6_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:30 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_10_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:31 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_11_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:31 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_12_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:31 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_7_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:32 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_8_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:32 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_9_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:32 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_1_GENE.fa\n", "-rw-rw-r-- 1 
sam sam 409M Jan 9 13:32 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_2_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:33 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_3_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:33 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_4_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:33 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_5_GENE.fa\n", "-rw-rw-r-- 1 sam sam 409M Jan 9 13:34 Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_6_GENE.fa\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "real\t6m32.190s\n", "user\t0m22.332s\n", "sys\t2m0.584s\n" ] } ], "source": [
"%%bash\n",
"# Stop if data_dir is unavailable so files are not downloaded into the wrong place.\n",
"cd \"${data_dir}\" || exit 1\n",
"time \\\n",
"wget \\\n",
"--recursive \\\n",
"--no-directories \\\n",
"--no-parent \\\n",
"--quiet \\\n",
"--accept \"*GENE.fa\" \\\n",
"\"http://gannet.fish.washington.edu/seashell/bu-serine-wd/19-01-08/\"\n",
"\n",
"ls -lhtr *.fa"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Prepared at request of this GitHub Issue: \n", "\n", "https://github.com/RobertsLab/resources/issues/593\n", "\n", "Code is modified from the following link in order to loop through a large number of files:\n", "\n", "http://htmlpreview.github.io/?https://github.com/hputnam/EastOyEpi/blob/master/02-Cpg-test.html" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Process is interrupted.\n" ] } ], "source": [
"%%bash\n",
"# CpG observed/expected (O/E) per sequence for every *GENE.fa file.\n",
"# Results go to <sample>_analysis/ID_CpG under the directory this cell starts in.\n",
"shopt -s nullglob\n",
"base_dir=$(pwd)\n",
"fa_array=(\"${data_dir}\"/*GENE.fa)\n",
"\n",
"for fa in \"${fa_array[@]}\"\n",
"do\n",
"  fn=$(basename \"${fa}\" .fa)\n",
"  # Absolute path, so a sample directory is never nested inside the previous one.\n",
"  sample_dir=\"${base_dir}/${fn}_analysis\"\n",
"  mkdir -p \"${sample_dir}\"\n",
"  cd \"${sample_dir}\" || exit 1\n",
"  # Tabular FASTA with a sequence-length column appended.\n",
"  \"${seqkit}\" fx2tab \\\n",
"  --length \\\n",
"  \"${fa}\" \\\n",
"  > \"${fn}_tab\"\n",
"  # Keep only the sequence column.\n",
"  awk '{ print $2 }' \"${fn}_tab\" > \"${fn}_tab2\"\n",
"  # Count motifs with gsub() rather than splitting on them with -F:\n",
"  # field splitting hits awk's 32767-field limit on long sequences.\n",
"  awk '{ print gsub(/[Cc][Gg]/, \"\") }' \"${fn}_tab2\" > CG\n",
"  awk '{ print gsub(/[Cc]/, \"\") }' \"${fn}_tab2\" > C\n",
"  awk '{ print gsub(/[Gg]/, \"\") }' \"${fn}_tab2\" > G\n",
"  # Whitespace-split columns of comb: ID, sequence, length, CG, C, G\n",
"  # (assumes sequence names contain no whitespace -- TODO confirm).\n",
"  paste \"${fn}_tab\" \\\n",
"  CG \\\n",
"  C \\\n",
"  G \\\n",
"  > comb\n",
"  # CpG O/E = (CG / (C * G)) * (length^2 / (length - 1))\n",
"  awk '{print $1, \"\\t\", (($4)/($5*$6))*(($3^2)/($3-1))}' comb \\\n",
"  > ID_CpG\n",
"done"
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Process is interrupted.\n" ] } ], "source": [
"%%bash\n",
"# CpG observed/expected (O/E) per sequence for every *GENE.fa file.\n",
"# Results go to ${analyses_dir}/<sample>_analysis/ID_CpG.\n",
"shopt -s nullglob\n",
"mkdir -p \"${analyses_dir}\"\n",
"\n",
"fa_array=(\"${data_dir}\"/*GENE.fa)\n",
"\n",
"for fa in \"${fa_array[@]}\"\n",
"do\n",
"  fn=$(basename \"${fa}\" .fa)\n",
"  sample_dir=\"${analyses_dir}/${fn}_analysis\"\n",
"  mkdir -p \"${sample_dir}\"\n",
"  cd \"${sample_dir}\" || exit 1\n",
"  # Tabular FASTA with a sequence-length column appended.\n",
"  \"${seqkit}\" fx2tab \\\n",
"  --length \\\n",
"  \"${fa}\" \\\n",
"  > \"${fn}_tab\"\n",
"  # Keep only the sequence column.\n",
"  awk '{ print $2 }' \"${fn}_tab\" > \"${fn}_tab2\"\n",
"  # Count motifs with gsub() rather than splitting on them with -F:\n",
"  # field splitting hits awk's 32767-field limit on long sequences.\n",
"  awk '{ print gsub(/[Cc][Gg]/, \"\") }' \"${fn}_tab2\" > CG\n",
"  awk '{ print gsub(/[Cc]/, \"\") }' \"${fn}_tab2\" > C\n",
"  awk '{ print gsub(/[Gg]/, \"\") }' \"${fn}_tab2\" > G\n",
"  # Whitespace-split columns of comb: ID, sequence, length, CG, C, G\n",
"  # (assumes sequence names contain no whitespace -- TODO confirm).\n",
"  paste \"${fn}_tab\" \\\n",
"  CG \\\n",
"  C \\\n",
"  G \\\n",
"  > comb\n",
"  # CpG O/E = (CG / (C * G)) * (length^2 / (length - 1))\n",
"  awk '{print $1, \"\\t\", (($4)/($5*$6))*(($3^2)/($3-1))}' comb \\\n",
"  > ID_CpG\n",
"done"
] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Process is terminated.\n" ] } ], "source": [
"%%bash\n",
"# CpG observed/expected (O/E) per sequence for every *GENE.fa file.\n",
"# Results go to ${analyses_dir}/<sample>_analysis/ID_CpG.\n",
"shopt -s nullglob\n",
"mkdir -p \"${analyses_dir}\"\n",
"\n",
"fa_array=(\"${data_dir}\"/*GENE.fa)\n",
"\n",
"for fa in \"${fa_array[@]}\"\n",
"do\n",
"  fn=$(basename \"${fa}\" .fa)\n",
"  sample_dir=\"${analyses_dir}/${fn}_analysis\"\n",
"  mkdir -p \"${sample_dir}\"\n",
"  cd \"${sample_dir}\" || exit 1\n",
"  # Tabular FASTA with a sequence-length column appended.\n",
"  \"${seqkit}\" fx2tab \\\n",
"  --length \\\n",
"  \"${fa}\" \\\n",
"  > \"${fn}_tab\"\n",
"  # Keep only the sequence column.\n",
"  awk '{ print $2 }' \"${fn}_tab\" > \"${fn}_tab2\"\n",
"  # Count motifs with gsub() rather than splitting on them with -F:\n",
"  # field splitting hits awk's 32767-field limit on long sequences.\n",
"  awk '{ print gsub(/[Cc][Gg]/, \"\") }' \"${fn}_tab2\" > CG\n",
"  awk '{ print gsub(/[Cc]/, \"\") }' \"${fn}_tab2\" > C\n",
"  awk '{ print gsub(/[Gg]/, \"\") }' \"${fn}_tab2\" > G\n",
"  # Whitespace-split columns of comb: ID, sequence, length, CG, C, G\n",
"  # (assumes sequence names contain no whitespace -- TODO confirm).\n",
"  paste \"${fn}_tab\" \\\n",
"  CG \\\n",
"  C \\\n",
"  G \\\n",
"  > comb\n",
"  # CpG O/E = (CG / (C * G)) * (length^2 / (length - 1))\n",
"  awk '{print $1, \"\\t\", (($4)/($5*$6))*(($3^2)/($3-1))}' comb \\\n",
"  > ID_CpG\n",
"done"
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SeqKit -- 
a cross-platform and ultrafast toolkit for FASTA/Q file manipulation\n", "\n", "Version: 0.9.3\n", "\n", "Author: Wei Shen \n", "\n", "Documents : http://bioinf.shenwei.me/seqkit\n", "Source code: https://github.com/shenwei356/seqkit\n", "Please cite: https://doi.org/10.1371/journal.pone.0163962\n", "\n", "Usage:\n", " seqkit [command]\n", "\n", "Available Commands:\n", " common find common sequences of multiple files by id/name/sequence\n", " concat concatenate sequences with same ID from multiple files\n", " convert convert FASTQ quality encoding between Sanger, Solexa and Illumina\n", " duplicate duplicate sequences N times\n", " faidx create FASTA index file and extract subsequence\n", " fq2fa convert FASTQ to FASTA\n", " fx2tab convert FASTA/Q to tabular format (with length/GC content/GC skew)\n", " genautocomplete generate shell autocompletion script\n", " grep search sequences by ID/name/sequence/sequence motifs, mismatch allowed\n", " head print first N FASTA/Q records\n", " help Help about any command\n", " locate locate subsequences/motifs, mismatch allowed\n", " range print FASTA/Q records in a range (start:end)\n", " rename rename duplicated IDs\n", " replace replace name/sequence by regular expression\n", " restart reset start position for circular genome\n", " rmdup remove duplicated sequences by id/name/sequence\n", " sample sample sequences by number or proportion\n", " seq transform sequences (revserse, complement, extract ID...)\n", " shuffle shuffle sequences\n", " sliding sliding sequences, circular genome supported\n", " sort sort sequences by id/name/sequence/length\n", " split split sequences into files by id/seq region/size/parts (mainly for FASTA)\n", " split2 split sequences into files by size/parts (FASTA, PE/SE FASTQ)\n", " stats simple statistics of FASTA/Q files\n", " subseq get subsequences by region/gtf/bed, including flanking sequences\n", " tab2fx convert tabular format to FASTA/Q format\n", " translate translate DNA/RNA to protein 
sequence\n", " version print version information and check for update\n", "\n", "Flags:\n", " --alphabet-guess-seq-length int length of sequence prefix of the first FASTA record based on which seqkit guesses the sequence type (0 for whole seq) (default 10000)\n", " -h, --help help for seqkit\n", " --id-ncbi FASTA head is NCBI-style, e.g. >gi|110645304|ref|NC_002516.2| Pseud...\n", " --id-regexp string regular expression for parsing ID (default \"^([^\\\\s]+)\\\\s?\")\n", " -w, --line-width int line width when outputing FASTA format (0 for no wrap) (default 60)\n", " -o, --out-file string out file (\"-\" for stdout, suffix .gz for gzipped out) (default \"-\")\n", " --quiet be quiet and do not show extra information\n", " -t, --seq-type string sequence type (dna|rna|protein|unlimit|auto) (for auto, it automatically detect by the first sequence) (default \"auto\")\n", " -j, --threads int number of CPUs. (default value: 1 for single-CPU PC, 2 for others) (default 2)\n", "\n", "Use \"seqkit [command] --help\" for more information about a command.\n" ] } ], "source": [ "%%bash\n", "${seqkit}" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", 
"\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CL_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum 
number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CLP_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_2_GENE_tab2\" FNR=314 NR=314\n", 
"awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_7_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.CS_7_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", 
"\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.DEBY_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit 
exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_7_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_7_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_2_GENE_tab2\" 
FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HC_VA_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HG_HG0F2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HG_HG0F2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", 
"\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HG_HG2F1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HG_HG2F1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HG_HG2M5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HG_HG2M5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit 
exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.HI_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_1_pool_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_1_pool_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_7_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_7_GENE_tab2\" 
FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_8_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LM_8_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", 
"\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.LOLA_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit 
exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NEH_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH0H4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH0H4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH2F6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH2F6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH2F8_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH2F8_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH2M1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.NG_NH2M1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", 
"\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.OBOYS2_6_GENE_tab2\" FNR=314 
NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", 
"\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SL_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_10_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_10_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_11_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_11_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_12_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_12_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_7_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_7_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_8_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_8_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_9_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: 
maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.SM_9_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_1_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_2_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_3_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_4_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_5_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", 
"\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_6_GENE_tab2\" FNR=314 NR=314\n", "awk: program limit exceeded: maximum number of fields size=32767\n", "\tFILENAME=\"Combined.SNP.TRSdp5g95FnDNAmaf05.sorted.ANACfill.UMFS_6_GENE_tab2\" FNR=314 NR=314\n" ] } ], "source": [ "%%bash\n", "mkdir ${analyses_dir}\n", "\n", "fa_array=(${data_dir}/*GENE.fa)\n", "\n", "for fa in \"${fa_array[@]}\"\n", "do\n", " cd ${analyses_dir}\n", " fn=$(basename ${fa} .fa)\n", " mkdir ${fn}_analysis\n", " \n", " cd ${fn}_analysis\n", " \n", " ${seqkit} fx2tab \\\n", " --length \\\n", " ${fa} \\\n", " > ${fn}_tab\n", " \n", " awk '{ print $2 }' ${fn}_tab > ${fn}_tab2\n", " \n", " awk -F\\[Cc][Gg] '{print NF-1}' ${fn}_tab2 > CG\n", " \n", " awk -F\\[Cc] '{print NF-1}' ${fn}_tab2 > C\n", " \n", " awk -F\\[Gg] '{print NF-1}' ${fn}_tab2 > G\n", " \n", " paste ${fn}_tab \\\n", " CG \\\n", " C \\\n", " G \\\n", " > comb\n", " \n", " awk '{print $1, \"\\t\", (($4)/($5*$6))*(($3^2)/($3-1))}' comb \\\n", " > ID_CpG\n", "done" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ugh. Turns out awk has some field limit. Solution: install and use ```gawk```.\n", "\n", "Installed ```gawk``` and restarted notebook." 
] },
{ "cell_type": "code", "execution_count": 2, "metadata": {},
 "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
  "\n",
  "real\t65m31.145s\n",
  "user\t53m47.100s\n",
  "sys\t2m53.228s\n"
 ] } ],
 "source": [
  "%%bash\n",
  "# Calculate the CpG observed/expected (O/E) ratio for every sequence in each *GENE.fa file.\n",
  "# Uses gawk: the stock awk on this machine aborted on these inputs with\n",
  "# 'program limit exceeded: maximum number of fields size=32767'.\n",
  "\n",
  "# Fail fast if the 'Set variables' cell has not been run in this kernel\n",
  ": \"${seqkit:?seqkit is not set}\" \"${data_dir:?data_dir is not set}\" \"${analyses_dir:?analyses_dir is not set}\"\n",
  "\n",
  "# -p: succeed if the directory already exists, so the cell can be re-run\n",
  "mkdir -p \"${analyses_dir}\"\n",
  "\n",
  "# Create array of all FastA files\n",
  "fa_array=(\"${data_dir}\"/*GENE.fa)\n",
  "\n",
  "time \\\n",
  "for fa in \"${fa_array[@]}\"\n",
  "do\n",
  "  # Remove file path and extension from the FastA and save as variable\n",
  "  fn=$(basename \"${fa}\" .fa)\n",
  "\n",
  "  # Make a subdirectory named after the input file and work inside it;\n",
  "  # stop instead of writing results into the wrong directory if cd fails\n",
  "  mkdir -p \"${analyses_dir}/${fn}_analysis\"\n",
  "  cd \"${analyses_dir}/${fn}_analysis\" || exit 1\n",
  "\n",
  "  # Use seqkit to convert FastA to tab-delimited and print sequence length\n",
  "  \"${seqkit}\" fx2tab \\\n",
  "    --length \\\n",
  "    \"${fa}\" \\\n",
  "    > \"${fn}_tab\"\n",
  "\n",
  "  # Print only sequences to new file\n",
  "  gawk '{ print $2 }' \"${fn}_tab\" > \"${fn}_tab2\"\n",
  "\n",
  "  # Delimit sequences on CGs and print the number of fields minus 1 to get the number of CGs present.\n",
  "  gawk -F '[Cc][Gg]' '{print NF-1}' \"${fn}_tab2\" > CG\n",
  "\n",
  "  # Delimit sequences on Cs and print the number of fields minus 1 to get the number of Cs present.\n",
  "  gawk -F '[Cc]' '{print NF-1}' \"${fn}_tab2\" > C\n",
  "\n",
  "  # Delimit sequences on Gs and print the number of fields minus 1 to get the number of Gs present.\n",
  "  gawk -F '[Gg]' '{print NF-1}' \"${fn}_tab2\" > G\n",
  "\n",
  "  # Paste these together to have file with the following fields:\n",
  "  # - FastA header\n",
  "  # - Sequence\n",
  "  # - Sequence length\n",
  "  # - Number of CGs\n",
  "  # - Number of Cs\n",
  "  # - Number of Gs\n",
  "  paste \"${fn}_tab\" \\\n",
  "    CG \\\n",
  "    C \\\n",
  "    G \\\n",
  "    > comb\n",
  "\n",
  "  # CpG O/E = (CGs / (Cs * Gs)) * (length^2 / (length - 1)).\n",
  "  # gawk treats division by zero as fatal, so print NA when a sequence has\n",
  "  # no Cs, no Gs, or a length of 1 rather than aborting partway through the file.\n",
  "  gawk '{\n",
  "    if ($5 * $6 == 0 || $3 - 1 == 0) {\n",
  "      print $1, \"\\t\", \"NA\"\n",
  "    } else {\n",
  "      print $1, \"\\t\", (($4)/($5*$6))*(($3^2)/($3-1))\n",
  "    }\n",
  "  }' comb \\\n",
  "    > ID_CpG\n",
  "done"
 ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [],
 "source": [
  "%%bash\n",
  "# Copy the analysis directory to gannet.\n",
  "# Fail fast if the 'Set variables' cell has not been run in this kernel\n",
  ": \"${analyses_dir:?analyses_dir is not set}\"\n",
  "\n",
  "# Work from the parent of analyses_dir so that the source path handed to\n",
  "# rsync --relative is just ./<analysis directory name>\n",
  "cd \"$(dirname \"${analyses_dir}\")\" || exit 1\n",
  "\n",
  "rsync \\\n",
  "  --archive \\\n",
  "  --relative \\\n",
  "  \"./$(basename \"${analyses_dir}\")\" \\\n",
  "  gannet:/volume1/web/Atumefaciens"
 ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }
],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" } },
"nbformat": 4, "nbformat_minor": 2 }