{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Wed Nov 6 14:57:08 PST 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.971\n", "BogoMIPS: 5851.97\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 31G 450M 523M 38G 38G\n", "Swap: 4.7G 879M 3.8G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set variables\n", "\n", "`%env` are best for bash" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20191106_swoose_pgen_feature_stats\n", "env: rsync_gannet=gannet:/volume2/web/Atumefaciens/20191105_swoose_pgen_v074_renaming/\n", "env: wget_command=--directory-prefix=$/home/sam/analyses/20191106_swoose_pgen_feature_stats --quiety --no-directories --no-check-certificate https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/\n", "env: exon_gff=Panopea-generosa-v1.0.a4.exon.gff3\n", "env: exon_bed=Panopea-generosa-v1.0.a4.exon.bed\n", "env: gene_gff=Panopea-generosa-v1.0.a4.gene.gff3\n", "env: gene_bed=Panopea-generosa-v1.0.a4.gene.bed\n", "env: introns_bed=Panopea-generosa-v1.0.a4.introns.bed\n", "env: intergenic_bed=Panopea-generosa-v1.0.a4.intergenic.bed\n", "env: intersectbed=/home/sam/programs/bedtools-2.28.0/bin/intersectBed\n" ] } ], "source": [ "# Set workding directory\n", "%env wd=/home/sam/analyses/20191106_swoose_pgen_feature_stats\n", "wd=\"/home/sam/analyses/20191106_swoose_pgen_feature_stats\"\n", "\n", "# File download commands\n", "%env rsync_gannet=gannet:/volume2/web/Atumefaciens/20191105_swoose_pgen_v074_renaming/\n", "%env wget_command=--directory-prefix=${wd} --quiety --no-directories --no-check-certificate https://gannet.fish.washington.edu/Atumefaciens/20191105_swoose_pgen_v074_renaming/\n", "\n", "# Input/output files\n", "%env exon_gff=Panopea-generosa-v1.0.a4.exon.gff3\n", "%env exon_bed=Panopea-generosa-v1.0.a4.exon.bed\n", "%env gene_gff=Panopea-generosa-v1.0.a4.gene.gff3\n", "%env gene_bed=Panopea-generosa-v1.0.a4.gene.bed\n", "%env introns_bed=Panopea-generosa-v1.0.a4.introns.bed\n", "%env intergenic_bed=Panopea-generosa-v1.0.a4.intergenic.bed\n", "\n", "# Programs\n", "%env intersectbed=/home/sam/programs/bedtools-2.28.0/bin/intersectBed\n", "\n", "# Set list of column header names\n", "bed_header = ['scaffold', 'start', 'end']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import Python modules" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import fnmatch\n", "import os\n", "import pandas" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "mkdir --parents ${wd}" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20191106_swoose_pgen_feature_stats\n" ] } ], "source": [ "cd {wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download files" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "\n", "sent 11 bytes received 79 bytes 180.00 bytes/sec\n", "total size is 57,148,140 speedup is 634,979.33\n", "receiving incremental file list\n", "Panopea-generosa-v1.0.a4.gene.gff3\n", " 9,888,459 100% 110.95MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 9,889,785 bytes 19,779,630.00 bytes/sec\n", "total size is 9,888,459 speedup is 1.00\n", "receiving incremental file list\n", "\n", "sent 11 bytes received 83 bytes 62.67 bytes/sec\n", "total size is 1,019,566 speedup is 10,846.45\n", "receiving incremental file list\n", "Panopea-generosa-v1.0.a4.introns.bed\n", " 4,579,012 100% 111.97MB/s 0:00:00 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 4,579,691 bytes 9,159,442.00 bytes/sec\n", "total size is 4,579,012 speedup is 1.00\n", "total 76M\n", "-rw-rw-r-- 1 sam sam 6.7M Nov 6 15:21 Panopea-generosa-v1.0.a4.exon.bed\n", "-rw-rw-r-- 1 sam users 55M Nov 5 08:55 Panopea-generosa-v1.0.a4.exon.gff3\n", "-rw-rw-r-- 1 sam users 9.5M Nov 5 08:55 Panopea-generosa-v1.0.a4.gene.gff3\n", "-rw-rw-r-- 1 sam users 996K Nov 5 08:59 Panopea-generosa-v1.0.a4.intergenic.bed\n", "-rw-rw-r-- 1 sam users 4.4M Nov 5 08:59 Panopea-generosa-v1.0.a4.introns.bed\n" ] } ], "source": [ "%%bash\n", "# Create array of files from list\n", "files_array=(${exon_gff} ${gene_gff} ${intergenic_bed} ${introns_bed})\n", "\n", "for file in \"${files_array[@]}\"\n", "do\n", " rsync \\\n", " --archive \\\n", " --progress \\\n", " --verbose \\\n", " \"${rsync_gannet}${file}\" \\\n", " .\n", "done\n", "\n", "ls -lh\n", "\n", "# If need to download via wget, uncomment lines in the cell below\n", "# for file in \"${files_array[@]}\"\n", "# do \n", "# \"${wget_command}${file}\"\n", "# done\n", "# ls -lh ${wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create gene and exon BED files\n", "\n", "Too lazy to write for loop" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Preview GFF: Panopea-generosa-v1.0.a4.exon.gff3\n", "\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t2\t125\t.\t+\t.\tID=PGEN_.00g000010.m01.exon01;Name=PGEN_.00g000010.m01.exon01;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon1;Alias=21510-PGEN_.00g234140.m01.exon1\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t1995\t2095\t.\t+\t.\tID=PGEN_.00g000010.m01.exon02;Name=PGEN_.00g000010.m01.exon02;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon2;Alias=21510-PGEN_.00g234140.m01.exon2\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t3325\t3495\t.\t+\t.\tID=PGEN_.00g000010.m01.exon03;Name=PGEN_.00g000010.m01.exon03;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon3;Alias=21510-PGEN_.00g234140.m01.exon3\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t4651\t4719\t.\t+\t.\tID=PGEN_.00g000010.m01.exon04;Name=PGEN_.00g000010.m01.exon04;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon4;Alias=21510-PGEN_.00g234140.m01.exon4\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t19808\t19943\t.\t-\t.\tID=PGEN_.00g000020.m01.exon01;Name=PGEN_.00g000020.m01.exon01;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon10;Alias=21510-PGEN_.00g234150.m01.exon10\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t21133\t21362\t.\t-\t.\tID=PGEN_.00g000020.m01.exon02;Name=PGEN_.00g000020.m01.exon02;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon9;Alias=21510-PGEN_.00g234150.m01.exon9\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t22487\t22613\t.\t-\t.\tID=PGEN_.00g000020.m01.exon03;Name=PGEN_.00g000020.m01.exon03;Parent=PGEN_.00g000020.m01;original_ID=21510-PGEN_.00g234150.m01.exon8;Alias=21510-PGEN_.00g234150.m01.exon8\n", "\n", "---------------------\n", "\n", "Preview BED: Panopea-generosa-v1.0.a4.exon.bed\n", "\n", "Scaffold_01\t2\t125\n", "Scaffold_01\t1995\t2095\n", "Scaffold_01\t3325\t3495\n", "Scaffold_01\t4651\t4719\n", "Scaffold_01\t19808\t19943\n", "Scaffold_01\t21133\t21362\n", "Scaffold_01\t22487\t22613\n", "Scaffold_01\t24824\t24959\n", "Scaffold_01\t25981\t26126\n", "Scaffold_01\t27969\t28019\n", "\n", "---------------------\n", "\n", "Preview GFF: Panopea-generosa-v1.0.a4.gene.gff3\n", "\n", "##gff-version 3\n", "##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t2\t4719\t.\t+\t.\tID=PGEN_.00g000010;Name=PGEN_.00g000010;original_ID=21510-PGEN_.00g234140;Alias=21510-PGEN_.00g234140;original_name=21510-PGEN_.00g234140;Notes=sp|Q86IC9|CAMT1_DICDI [BLAST protein vs protein (blastp) 2.7.1],PF01596.12 [Pfam 1.6]\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t19808\t36739\t.\t-\t.\tID=PGEN_.00g000020;Name=PGEN_.00g000020;original_ID=21510-PGEN_.00g234150;Alias=21510-PGEN_.00g234150;original_name=21510-PGEN_.00g234150;Notes=sp|P04177|TY3H_RAT [BLAST protein vs protein (blastp) 2.7.1],sp|P04177|TY3H_RAT [DIAMOND Functional 0.9.22],IPR036951 [InterProScan 5.29-68.0],PF00351.16 [Pfam 1.6]\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t49248\t52578\t.\t-\t.\tID=PGEN_.00g000030;Name=PGEN_.00g000030;original_ID=21510-PGEN_.00g234160;Alias=21510-PGEN_.00g234160;original_name=21510-PGEN_.00g234160;Notes=PF08054.6 [Pfam 1.6]\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t55792\t67546\t.\t+\t.\tID=PGEN_.00g000040;Name=PGEN_.00g000040;original_ID=21510-PGEN_.00g234170;Alias=21510-PGEN_.00g234170;original_name=21510-PGEN_.00g234170\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t67586\t69113\t.\t-\t.\tID=PGEN_.00g000050;Name=PGEN_.00g000050;original_ID=21510-PGEN_.00g234180;Alias=21510-PGEN_.00g234180;original_name=21510-PGEN_.00g234180;Notes=sp|Q8L840|RQL4A_ARATH [BLAST protein vs protein (blastp) 2.7.1],sp|Q8L840|RQL4A_ARATH [DIAMOND Functional 0.9.22],PF00270.24 [Pfam 1.6]\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t70713\t81099\t.\t+\t.\tID=PGEN_.00g000060;Name=PGEN_.00g000060;original_ID=21510-PGEN_.00g234190;Alias=21510-PGEN_.00g234190;original_name=21510-PGEN_.00g234190;Notes=sp|Q61043|NIN_MOUSE [DIAMOND Functional 0.9.22],PF04443.7 [Pfam 1.6]\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t183686\t186073\t.\t+\t.\tID=PGEN_.00g000070;Name=PGEN_.00g000070;original_ID=21510-PGEN_.00g234200;Alias=21510-PGEN_.00g234200;original_name=21510-PGEN_.00g234200;Notes=PF15364.1 [Pfam 1.6]\n", "\n", "---------------------\n", "\n", "Preview BED: Panopea-generosa-v1.0.a4.gene.bed\n", "\n", "Scaffold_01\t2\t4719\n", "Scaffold_01\t19808\t36739\n", "Scaffold_01\t49248\t52578\n", "Scaffold_01\t55792\t67546\n", "Scaffold_01\t67586\t69113\n", "Scaffold_01\t70713\t81099\n", "Scaffold_01\t183686\t186073\n", "Scaffold_01\t187328\t188353\n", "Scaffold_01\t189849\t190460\n", "Scaffold_01\t191069\t191410\n", "\n" ] } ], "source": [ "%%bash\n", "echo \"Preview GFF: ${exon_gff}\"\n", "echo \"\"\n", "head \"${exon_gff}\"\n", "echo \"\"\n", "echo \"---------------------\"\n", "echo \"\"\n", "\n", "# Create BED file\n", "# Skip first three lines (header) of GFF\n", "# Print scaffold name, start, and stop positions; tab-delimited\n", "awk 'NR > 3 {print $1\"\\t\"$4\"\\t\"$5}' \"${exon_gff}\" \\\n", "> \"${exon_bed}\"\n", "\n", "echo \"Preview BED: ${exon_bed}\"\n", "echo \"\"\n", "head \"${exon_bed}\"\n", "echo \"\"\n", "echo \"---------------------\"\n", "echo \"\"\n", "\n", "echo \"Preview GFF: ${gene_gff}\"\n", "echo \"\"\n", "head \"${gene_gff}\"\n", "echo \"\"\n", "echo \"---------------------\"\n", "echo \"\"\n", "\n", "# Create BED file\n", "# Skip first three lines (header) of GFF\n", "# Print scaffold name, start, and stop positions; tab-delimited\n", "awk 'NR > 3 {print $1\"\\t\"$4\"\\t\"$5}' \"${gene_gff}\" \\\n", "> \"${gene_bed}\"\n", "\n", "echo \"Preview BED: ${gene_bed}\"\n", "echo \"\"\n", "head \"${gene_bed}\"\n", "echo \"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Determine percent of genes containing introns" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "34947 found in Panopea-generosa-v1.0.a4.gene.bed\n", "\n", "----------------------------\n", "\n", "9772 genes with no introns\n", "\n", "----------------------------\n", "\n", ".7204 of genes contain introns\n" ] } ], "source": [ "%%bash\n", "# Count the number of genes\n", "num_genes=$(cat ${gene_bed} | wc -l)\n", "echo \"${num_genes} found in ${gene_bed}\"\n", "echo \"\"\n", "echo \"----------------------------\"\n", "echo \"\"\n", "\n", "# Determine number of genes with no intron overlap (that's the -v option below)\n", "num_genes_no_introns=$(${intersectbed} -v -a ${gene_bed} -b ${introns_bed} | wc -l)\n", "echo \"${num_genes_no_introns} genes with no introns\"\n", "echo \"\"\n", "echo \"----------------------------\"\n", "echo \"\"\n", "\n", "# Determine percentage of genes with introns.\n", "# Sets \"scale\" to get desired number of decimal places.\n", "percent_genes_w_introns=$(echo \"scale=4; 1 - (${num_genes_no_introns} / ${num_genes})\" | bc)\n", "\n", "\n", "echo \"${percent_genes_w_introns} of genes contain introns\"\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stats summaries for the BED files" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.intergenic.bed\n", "-------------------------\n", "mean\t16393.607065170105\n", "min\t2.0\n", "median\t8114.0\n", "max\t370672.0\n", "sum\t565235178.0\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.gene.bed\n", "-------------------------\n", "mean\t10811.04461041005\n", "min\t166.0\n", "median\t4464.0\n", "max\t283066.0\n", "sum\t377813576.0\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.introns.bed\n", "-------------------------\n", "mean\t2184.529027548067\n", "min\t2.0\n", "median\t1199.0\n", "max\t93104.0\n", "sum\t338130141.0\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.exon.bed\n", "-------------------------\n", "mean\t201.4769876772451\n", "min\t3.0\n", "median\t133.0\n", "max\t13221.0\n", "sum\t47741987.0\n" ] } ], "source": [ "for file in os.listdir('.'):\n", " if fnmatch.fnmatch(file, '*.bed'):\n", " print('\\n' * 2)\n", " print(file)\n", " print(\"-------------------------\")\n", " \n", " # Import GFF.\n", " # Skip first 3 rows (gff header lines) and indicate file is tab-separated\n", " bed=pandas.read_csv(file, header=None, sep=\"\\t\")\n", " \n", " # Rename columns\n", " bed.columns = bed_header\n", " \n", " # Subtract start value from end value.\n", " # Have to add 1 so that sequence length can't equal zero (i.e. adjust for 1-based counting system)\n", " bed['seqlength'] = bed.apply(lambda position: position['end'] - position['start'] + 1, axis=1)\n", " \n", " # Apply functions in list to seqlength column\n", " bed_stats = bed['seqlength'].agg(['mean', 'min', 'median', 'max', 'sum'])\n", " \n", " # Print table of calculation type and the result\n", " for calc_type, calcs in bed_stats.iteritems():\n", " print(calc_type, calcs, sep='\\t')" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "### Clean up" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "rm: cannot remove 'Panopea-generosa-v1.0.a4.exon.gff3': No such file or directory\n", "rm: cannot remove 'Panopea-generosa-v1.0.a4.gene.gff3': No such file or directory\n", "rm: cannot remove 'Panopea-generosa-v1.0.a4.intergenic.bed': No such file or directory\n", "rm: cannot remove 'Panopea-generosa-v1.0.a4.introns.bed': No such file or directory\n" ] } ], "source": [ "%%bash\n", "# Cleanup\n", "rm ${exon_bed} ${exon_gff} ${gene_bed} ${gene_gff} ${intergenic_bed} ${introns_bed}\n", "\n", "ls -l" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }