{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# gathergrams - display gather results" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import pylab\n", "import numpy\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
intersect_bpf_orig_queryf_matchf_unique_to_queryf_unique_weightedaverage_abundmedian_abundstd_abundnamefilenamemd5f_match_origunique_intersect_bpgather_result_rankremaining_bp
093230000.0248850.9772540.0248850.0086496.8318146.05.847358GCA_000013645all-gather-genomes.sbt.zipee9cb05018009af43429e83e7d502a0a0.97725493230000365320000
172880000.0194530.9990400.0194530.02351723.76303518.029.637611GCA_000009705all-gather-genomes.sbt.zip264cfdad44548ad96c4a24b6a514a8770.99904072880001358032000
270390000.0187890.9998580.0187890.08956293.70024292.027.877040GCA_000196115all-gather-genomes.sbt.zip7141c3fbf206734950d13cadba0bf46c0.99985870390002350993000
366040000.0176270.9996970.0176270.01462116.30405816.06.636986GCA_000018565all-gather-genomes.sbt.zip5165e0f27ae96c8a78da00c2d30a8e360.99969766040003344389000
\n", "
" ], "text/plain": [ " intersect_bp f_orig_query f_match f_unique_to_query f_unique_weighted \\\n", "0 9323000 0.024885 0.977254 0.024885 0.008649 \n", "1 7288000 0.019453 0.999040 0.019453 0.023517 \n", "2 7039000 0.018789 0.999858 0.018789 0.089562 \n", "3 6604000 0.017627 0.999697 0.017627 0.014621 \n", "\n", " average_abund median_abund std_abund name \\\n", "0 6.831814 6.0 5.847358 GCA_000013645 \n", "1 23.763035 18.0 29.637611 GCA_000009705 \n", "2 93.700242 92.0 27.877040 GCA_000196115 \n", "3 16.304058 16.0 6.636986 GCA_000018565 \n", "\n", " filename md5 f_match_orig \\\n", "0 all-gather-genomes.sbt.zip ee9cb05018009af43429e83e7d502a0a 0.977254 \n", "1 all-gather-genomes.sbt.zip 264cfdad44548ad96c4a24b6a514a877 0.999040 \n", "2 all-gather-genomes.sbt.zip 7141c3fbf206734950d13cadba0bf46c 0.999858 \n", "3 all-gather-genomes.sbt.zip 5165e0f27ae96c8a78da00c2d30a8e36 0.999697 \n", "\n", " unique_intersect_bp gather_result_rank remaining_bp \n", "0 9323000 0 365320000 \n", "1 7288000 1 358032000 \n", "2 7039000 2 350993000 \n", "3 6604000 3 344389000 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample_id = 'SRR606249'\n", "df = pd.read_csv(f'outputs/{sample_id}.gather.csv')\n", "\n", "def fix_name(x):\n", "    \"\"\"Return the first two '_'-separated fields of x, cut at the first '.'.\"\"\"\n", "    leading_fields = x.split('_', 2)[:2]\n", "    stem, _dot, _rest = \"_\".join(leading_fields).partition('.')\n", "    return stem\n", "\n", "# Overwrite the name column with its shortened form, then preview the first four rows.\n", "df['name'] = df['name'].apply(fix_name)\n", "df.head(4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Figure 1: gathergram" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Gathergram: two per-genome series (column / 1e6) plotted against the DataFrame index.\n", "fig, ax = pylab.subplots(figsize=(8, 6))\n", "\n", "x_positions = df.index\n", "ax.plot(x_positions, df.intersect_bp / 1e6, 'x', label='all hashes classified to this genome')\n", "ax.plot(x_positions, df.unique_intersect_bp / 1e6, 'o', label='hashes specific to this genome')\n", "# Third series, left disabled:\n", "#ax.plot(x_positions, df.remaining_bp / 1e6, '-', label='unclassified bp remaining')\n", "\n", "# NOTE(review): the plotted columns are named *_bp but the axis label says hashes - confirm units.\n", "ax.set_xlabel('genome gather rank')\n", "ax.set_ylabel('num hashes (millions)')\n", "ax.legend(loc='upper right')\n", "ax.set_title('hu-s1 gather x genbank')\n", "\n", "# Write the figure as a PDF named after the sample.\n", "fig.savefig(f'/tmp/gathergram-{sample_id}.pdf')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python (myenv)", "language": "python", "name": "myenv" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }