{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Wed Sep 4 15:18:48 PDT 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.993\n", "BogoMIPS: 5851.93\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp pti tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 10G 830M 629M 59G 58G\n", "Swap: 4.7G 290M 4.4G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "# Record the run date, OS release, hostname, CPU and memory of the machine\n", "# so the environment this analysis ran on is documented with the results.\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20190904_pgen_v074.a3_genome_feature_counts\n", "env: rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "env: gffs=Panopea-generosa-vv0.74.a3.[Cegm]*.gff3\n", "env: wget_gffs=--directory-prefix=$/home/sam/analyses/20190904_pgen_v074.a3_genome_feature_counts --recursive --quiety --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a3.[Cegm]*.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/\n" ] } ], "source": [ "# Working directory: defined once as a Python variable, then exported so that\n", "# %%bash cells can read it as ${wd}.\n", "wd = \"/home/sam/analyses/20190904_pgen_v074.a3_genome_feature_counts\"\n", "%env wd={wd}\n", "\n", "# rsync source of the GFFs.\n", "# NOTE: the name 'rysnc_owl' is misspelled, but it is kept because the rsync\n", "# download cell reads this exact name.\n", "%env rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/\n", "\n", "# Filename pattern of the feature GFFs to fetch.\n", "%env gffs=Panopea-generosa-vv0.74.a3.[Cegm]*.gff3\n", "\n", "# wget options for the HTTPS fallback download.\n", "# %env expands Python variables written as {wd}; the shell form ${wd} left a\n", "# stray '$' in front of the path. The wget flag is --quiet (not --quiety).\n", "%env wget_gffs=--directory-prefix={wd} --recursive --quiet --no-directories --no-check-certificate --no-parent --accept 'Panopea-generosa-vv0.74.a3.[Cegm]*.gff3' https://owl.fish.washington.edu/halfshell/genomic-databank/" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Import Python modules" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import fnmatch\n", "import os\n", "import pandas" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Create necessary directories" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "# Quote the path so it stays a single argument even if it ever contains spaces.\n", "mkdir --parents \"${wd}\"" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20190904_pgen_v074.a3_genome_feature_counts\n" ] } ], "source": [ "# Explicit magic: does not depend on IPython's automagic setting.\n", "%cd {wd}" ] },
{ 
"cell_type": "markdown", "metadata": {}, "source": [ "#### Download _Panopea generosa_ GFFs for v074.a3.\n", "\n", "Info on GFFs is here: [https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3](https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "./\n", "Panopea-generosa-vv0.74.a3.CDS.gff3\n", " 50,945,464 100% 2.09MB/s 0:00:23 (xfr#1, to-chk=3/5)\n", "Panopea-generosa-vv0.74.a3.exon.gff3\n", " 52,378,458 100% 467.22kB/s 0:01:49 (xfr#2, to-chk=2/5)\n", "Panopea-generosa-vv0.74.a3.gene.gff3\n", " 10,251,196 100% 429.17kB/s 0:00:23 (xfr#3, to-chk=1/5)\n", "Panopea-generosa-vv0.74.a3.mRNA.gff3\n", " 12,732,694 100% 210.54kB/s 0:00:59 (xfr#4, to-chk=0/5)\n", "\n", "sent 142 bytes received 126,323,558 bytes 514,556.82 bytes/sec\n", "total size is 126,307,812 speedup is 1.00\n", "\n", "\n", "----------------------------------------------------------\n", "total 121M\n", "-rw-rw-r-- 1 sam users 49M Sep 3 06:23 Panopea-generosa-vv0.74.a3.CDS.gff3\n", "-rw-rw-r-- 1 sam users 50M Sep 3 06:23 Panopea-generosa-vv0.74.a3.exon.gff3\n", "-rw-rw-r-- 1 sam users 9.8M Sep 3 06:23 Panopea-generosa-vv0.74.a3.gene.gff3\n", "-rw-rw-r-- 1 sam users 13M Sep 3 06:23 Panopea-generosa-vv0.74.a3.mRNA.gff3\n" ] } ], "source": [ "%%bash\n", "\n", "# Copy into the current directory only the files matching ${gffs}:\n", "# the --include pattern is listed before the catch-all --exclude=\"*\".\n", "rsync \\\n", "--archive \\\n", "--verbose \\\n", "--progress \\\n", "--include=\"${gffs}\" \\\n", "--exclude=\"*\" \\\n", "\"${rysnc_owl}\" \\\n", ".\n", "\n", "echo \"\"\n", "echo \"\"\n", "echo \"----------------------------------------------------------\"\n", "\n", "ls -lh" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### If you need to download via wget instead, uncomment the lines in the cell below" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# %%bash\n", "# time \\\n", "# wget \\\n", "# --directory-prefix=\"${wd}\" \\\n", "# --recursive \\\n", "# --quiet \\\n", "# --no-directories \\\n", "# --no-check-certificate \\\n", "# --no-parent \\\n", "# --accept \"${gffs}\" \\\n", "# https://owl.fish.washington.edu/halfshell/genomic-databank/\n", "\n", "# ls -lh \"${wd}\"" ] },
{ "cell_type": 
"code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "##gff-version 3\n", "##Generated using GenSAS, Tuesday 3rd of September 2019 06:14:34 AM\n", "##Project Name : Pgenerosa_v074\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d6843e21278a-publish\tCDS\t1\t125\t.\t+\t2\tID=PGEN_.00g000010.m01.CDS01;Name=PGEN_.00g000010.m01.CDS01;Parent=PGEN_.00g000010.m01;original_ID=cds.19849-PGEN_.00g000010.m01;Alias=cds.19849-PGEN_.00g000010.m01\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d6843e21278a-publish\tCDS\t1995\t2095\t.\t+\t1\tID=PGEN_.00g000010.m01.CDS02;Name=PGEN_.00g000010.m01.CDS02;Parent=PGEN_.00g000010.m01;original_ID=cds.19849-PGEN_.00g000010.m01;Alias=cds.19849-PGEN_.00g000010.m01\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d6843e21278a-publish\tCDS\t3325\t3495\t.\t+\t0\tID=PGEN_.00g000010.m01.CDS03;Name=PGEN_.00g000010.m01.CDS03;Parent=PGEN_.00g000010.m01;original_ID=cds.19849-PGEN_.00g000010.m01;Alias=cds.19849-PGEN_.00g000010.m01\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d6843e21278a-publish\tCDS\t14288\t14485\t.\t+\t0\tID=PGEN_.00g000010.m01.CDS04;Name=PGEN_.00g000010.m01.CDS04;Parent=PGEN_.00g000010.m01;original_ID=cds.19849-PGEN_.00g000010.m01;Alias=cds.19849-PGEN_.00g000010.m01\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d6843e21278a-publish\tCDS\t17116\t17292\t.\t+\t0\tID=PGEN_.00g000010.m01.CDS05;Name=PGEN_.00g000010.m01.CDS05;Parent=PGEN_.00g000010.m01;original_ID=cds.19849-PGEN_.00g000010.m01;Alias=cds.19849-PGEN_.00g000010.m01\n", "PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d6843e21278a-publish\tCDS\t19808\t19943\t.\t-\t2\tID=PGEN_.00g000020.m01.CDS01;Name=PGEN_.00g000020.m01.CDS01;Parent=PGEN_.00g000020.m01;original_ID=cds.19849-PGEN_.00g000020.m01;Alias=cds.19849-PGEN_.00g000020.m01\n", 
"PGA_scaffold1__77_contigs__length_89643857\tGenSAS_5d6843e21278a-publish\tCDS\t21133\t21362\t.\t-\t0\tID=PGEN_.00g000020.m01.CDS02;Name=PGEN_.00g000020.m01.CDS02;Parent=PGEN_.00g000020.m01;original_ID=cds.19849-PGEN_.00g000020.m01;Alias=cds.19849-PGEN_.00g000020.m01\n" ] } ], "source": [ "%%bash\n", "# Preview the header block and first records of one GFF to confirm its layout.\n", "head Panopea-generosa-vv0.74.a3.CDS.gff3" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Set list of column header names\n", "gff_header = ['seqid','source','type','start','end','score','strand','phase','attributes']" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Get sequence length stats for each GFF feature type (CDS, exon, gene, mRNA)" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.exon.gff3\n", "-------------------------\n", "mean 255.932825\n", "min 3.000000\n", "median 157.000000\n", "max 13359.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.CDS.gff3\n", "-------------------------\n", "mean 255.932825\n", "min 3.000000\n", "median 157.000000\n", "max 13359.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.mRNA.gff3\n", "-------------------------\n", "mean 13318.053183\n", "min 201.000000\n", "median 2346.000000\n", "max 345225.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-vv0.74.a3.gene.gff3\n", "-------------------------\n", "mean 13318.053183\n", "min 201.000000\n", "median 2346.000000\n", "max 345225.000000\n", "Name: seqlength, dtype: float64\n" ] } ], "source": [ "# Print length statistics (mean, min, median, max) for every v074.a3 GFF in the\n", "# current directory. Filenames are sorted so the report order is reproducible\n", "# (os.listdir returns entries in arbitrary order).\n", "for gff_file in sorted(os.listdir('.')):\n", "    if not fnmatch.fnmatch(gff_file, 'Panopea-generosa-vv0.74.a3*.gff3'):\n", "        continue\n", "\n", "    print('\\n' * 2)\n", "    print(gff_file)\n", "    print(\"-------------------------\")\n", "\n", "    # Import GFF (tab-separated, no column-name row).\n", "    # Lines beginning with '#' (the '##gff-version' header block and any other\n", "    # directive lines) are ignored, so the header line count is not hard-coded.\n", "    # Column names are assigned at read time from gff_header.\n", "    # NOTE(review): comment='#' also cuts a record at any literal '#'; start/end\n", "    # come before the attributes column, so lengths are unaffected - confirm\n", "    # before relying on the attributes column of this frame.\n", "    gff = pandas.read_csv(gff_file, sep=\"\\t\", comment=\"#\", header=None, names=gff_header)\n", "\n", "    # Subtract start value from end value.\n", "    # GFF coordinates are 1-based and inclusive, so add 1 (a one-base feature\n", "    # has start == end and length 1). Vectorized; no row-by-row apply needed.\n", "    gff['seqlength'] = gff['end'] - gff['start'] + 1\n", "\n", "    # Apply functions in list to seqlength column\n", "    gff_stats = gff['seqlength'].agg(['mean', 'min', 'median', 'max'])\n", "\n", "    print(gff_stats)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }