{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Mon Aug 26 14:25:46 PDT 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.993\n", "BogoMIPS: 5851.93\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp pti tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 8.1G 42G 472M 19G 61G\n", "Swap: 4.7G 0B 4.7G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20190826_pgen_genome_feature_counts\n", "env: gffs=owl:/volume1/web/halfshell/genomic-databank/Pgenerosa_v07[04]*.gff\n", "env: wget_gffs=--directory-prefix=$/home/sam/analyses/20190826_pgen_genome_feature_counts --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Pgenerosa_v07[04]*.gff' https://owl.fish.washington.edu/halfshell/genomic-databank/\n" ] } ], "source": [ "%env wd=/home/sam/analyses/20190826_pgen_genome_feature_counts\n", "wd=\"/home/sam/analyses/20190826_pgen_genome_feature_counts\"\n", "%env gffs=owl:/volume1/web/halfshell/genomic-databank/Pgenerosa_v07[04]*.gff\n", "%env wget_gffs=--directory-prefix=${wd} --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Pgenerosa_v07[04]*.gff' https://owl.fish.washington.edu/halfshell/genomic-databank/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import Python modules" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import fnmatch\n", "import os\n", "import pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create necessary directories" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "mkdir --parents ${wd}" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20190826_pgen_genome_feature_counts\n" ] } ], "source": [ "cd {wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Download _Panopea generosa_ GFFs for v070, v070 top 18 scaffolds, and v074.\n", "\n", "Info on GFFs is here: [https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "Pgenerosa_v074.CpG.gff\n", " 2,623,644,046 100% 28.14MB/s 0:01:28 (xfr#1, to-chk=6/18)\n", "Pgenerosa_v074.fa.out.gff\n", " 62,754,222 100% 20.47MB/s 0:00:02 (xfr#2, to-chk=4/18)\n", "\n", "sent 49 bytes received 2,686,726,727 bytes 29,045,694.88 bytes/sec\n", "total size is 2,778,393,209 speedup is 1.03\n", "\n", "\n", "----------------------------------------------------------\n", "total 88M\n", "-rw-rw-r-- 1 sam users 20M May 23 15:02 Pgenerosa_v070.CDS.gff\n", "-rw-rw-r-- 1 sam users 22M May 23 15:02 Pgenerosa_v070.exon.gff\n", "-rw-rw-r-- 1 sam users 12M May 23 15:03 Pgenerosa_v070.gene.gff\n", "-rw-rw-r-- 1 sam users 16M May 23 15:03 Pgenerosa_v070.mRNA.gff\n", "-rw-rw-r-- 1 sam users 5.0M Aug 26 09:53 Pgenerosa_v070_top18_scaffolds.CDS.gff\n", "-rw-rw-r-- 1 sam users 5.6M Aug 26 09:54 Pgenerosa_v070_top18_scaffolds.exon.gff\n", "-rw-rw-r-- 1 sam users 500K Aug 26 11:32 Pgenerosa_v070_top18_scaffolds.five_prime_UTR.gff\n", "-rw-r--r-- 1 sam users 2.0M Aug 21 09:03 Pgenerosa_v070_top18_scaffolds.gene.gff\n", "-rw-rw-r-- 1 sam users 2.7M Aug 26 09:54 Pgenerosa_v070_top18_scaffolds.mRNA.gff\n", "-rw-rw-r-- 1 sam users 522K Aug 26 11:32 Pgenerosa_v070_top18_scaffolds.three_prime_UTR.gff\n", "-rw-rw-r-- 1 sam users 1.3M Jul 10 08:00 Pgenerosa_v074.CDS.gff\n", "-rw-rw-r-- 1 sam users 1.4M Jul 10 08:01 Pgenerosa_v074.exon.gff\n", "-rw-rw-r-- 1 sam users 102K Aug 22 15:41 Pgenerosa_v074.five_prime_UTR.gff\n", "-rw-rw-r-- 1 sam users 577K Jul 10 08:01 Pgenerosa_v074.gene.gff\n", "-rw-rw-r-- 1 sam users 716K Jul 10 08:02 Pgenerosa_v074.mRNA.gff\n", "-rw-rw-r-- 1 sam users 52K Aug 22 15:41 Pgenerosa_v074.three_prime_UTR.gff\n" ] } ], "source": [ "%%bash\n", "\n", "rsync \\\n", "--archive \\\n", "--verbose \\\n", "--progress \\\n", "\"${gffs}\" .\n", "\n", "rm Pgenerosa_v074.fa.out.gff Pgenerosa_v074.CpG.gff\n", "\n", "echo \"\"\n", "echo \"\"\n", "echo \"----------------------------------------------------------\"\n", "\n", "ls -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### If need to download via wget, uncomment lines in the cell below" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# %%bash\n", "# time \\\n", "# wget \"${wget_gffs}\"\n", "\n", "# ls -lh ${wd}" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Set list of column header names\n", "gff_header = ['seqid','source','type','start','end','score','strand','phase','attributes']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get sequence length stats for " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "Pgenerosa_v070.exon.gff\n", "-------------------------\n", "mean 217.971132\n", "min 3.000000\n", "median 139.000000\n", "max 16912.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v070_top18_scaffolds.three_prime_UTR.gff\n", "-------------------------\n", "mean 481.02338\n", "min 1.00000\n", "median 221.00000\n", "max 8355.00000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v070_top18_scaffolds.five_prime_UTR.gff\n", "-------------------------\n", "mean 148.558072\n", "min 1.000000\n", "median 89.000000\n", "max 16912.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v070_top18_scaffolds.exon.gff\n", "-------------------------\n", "mean 219.024085\n", "min 3.000000\n", "median 131.000000\n", "max 16912.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v074.five_prime_UTR.gff\n", "-------------------------\n", "mean 62.226496\n", "min 1.000000\n", "median 41.000000\n", "max 1259.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v070_top18_scaffolds.mRNA.gff\n", "-------------------------\n", "mean 8696.297953\n", "min 180.000000\n", "median 4621.000000\n", "max 139826.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v074.CDS.gff\n", "-------------------------\n", "mean 193.248049\n", "min 1.000000\n", "median 126.000000\n", "max 11772.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v070_top18_scaffolds.gene.gff\n", "-------------------------\n", "mean 8696.297953\n", "min 180.000000\n", "median 4621.000000\n", "max 139826.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v074.mRNA.gff\n", "-------------------------\n", "mean 12170.897196\n", "min 192.000000\n", "median 5552.000000\n", "max 175905.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v074.three_prime_UTR.gff\n", "-------------------------\n", "mean 516.604585\n", "min 1.000000\n", "median 251.000000\n", "max 4080.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v074.gene.gff\n", "-------------------------\n", "mean 12170.897196\n", "min 192.000000\n", "median 5552.000000\n", "max 175905.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v070.CDS.gff\n", "-------------------------\n", "mean 194.539909\n", "min 1.000000\n", "median 132.000000\n", "max 11048.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v070.gene.gff\n", "-------------------------\n", "mean 3768.232394\n", "min 177.000000\n", "median 1474.000000\n", "max 139826.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v070.mRNA.gff\n", "-------------------------\n", "mean 3768.232394\n", "min 177.000000\n", "median 1474.000000\n", "max 139826.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v074.exon.gff\n", "-------------------------\n", "mean 210.612431\n", "min 3.000000\n", "median 130.000000\n", "max 11772.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Pgenerosa_v070_top18_scaffolds.CDS.gff\n", "-------------------------\n", "mean 177.728604\n", "min 1.000000\n", "median 123.000000\n", "max 9851.000000\n", "Name: seqlength, dtype: float64\n" ] } ], "source": [ "for file in os.listdir('.'):\n", " if fnmatch.fnmatch(file, 'Pgenerosa_v07[04]*.gff'):\n", " print('\\n' * 2)\n", " print(file)\n", " print(\"-------------------------\")\n", " \n", " # Import GFF.\n", " # Skip first row (gff header line) and indicate file is tab-separated\n", " gff=pandas.read_csv(file, header=None, skiprows=1, sep=\"\\t\")\n", " \n", " # Rename columns\n", " gff.columns = gff_header\n", " \n", " # Subtract start value from end value.\n", " # Have to add 1 so that sequence length can't equal zero (i.e. adjust for 1-based counting system)\n", " gff['seqlength'] = gff.apply(lambda position: position['end'] - position['start'] + 1, axis=1)\n", " \n", " # Apply functions in list to seqlength column\n", " gff_stats = gff['seqlength'].agg(['mean', 'min', 'median', 'max'])\n", " \n", " print (gff_stats)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }