{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# IPython parallel computing clusters" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "from functools import partial" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a fake dataset of SNV" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "CHROMOSOMES = list(range(1, 23)) + ['X', 'Y']\n", "BASES = list('ACGT')\n", "\n", "# Knobs for the synthetic dataset.\n", "N_VARIANTS = 5000  # number of fake SNVs to generate\n", "SEED = 42  # fixed seed so a fresh run regenerates the same dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Seed inside this cell so that re-running it always rebuilds the same dataframe\n", "np.random.seed(SEED)\n", "\n", "# Create a dataframe\n", "df = pd.DataFrame({'Chromosome': np.random.choice(CHROMOSOMES, size=N_VARIANTS, replace=True, p=None),\n", "                   'Position': np.random.randint(1000, 10000, size=N_VARIANTS),\n", "                   'Reference': np.random.choice(BASES, size=N_VARIANTS, replace=True, p=None)})\n", "\n", "# The alternate base is drawn among the bases that differ from the reference one\n", "df['Alternate'] = df.apply(lambda x: np.random.choice([i for i in BASES if i != x['Reference']]), axis=1)\n", "df = df[['Chromosome', 'Position', 'Reference', 'Alternate']]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ChromosomePositionReferenceAlternate
0183313TC
1223062TC
2112584CA
362867CA
4208704TC
\n", "
" ], "text/plain": [ " Chromosome Position Reference Alternate\n", "0 18 3313 T C\n", "1 22 3062 T C\n", "2 11 2584 C A\n", "3 6 2867 C A\n", "4 20 8704 T C" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(5)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(5000, 4)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check if the mutations overlap by considering windows of 100bp" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Non parallel version" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def overlap(line, positions, window=100):\n", "    \"\"\"Count the entries of `positions` within `window` bp of `line['Position']`.\n", "\n", "    Both bounds are inclusive, so a row whose own position is listed in\n", "    `positions` is counted too. `window` defaults to the 100 bp used in this\n", "    notebook.\n", "    \"\"\"\n", "    position = line['Position']  # read once rather than twice per comparison\n", "    return len([i for i in positions if position - window <= i <= position + window])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 16.9 s, sys: 82.8 ms, total: 17 s\n", "Wall time: 17.6 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/loris/.virtualenvs/python3_meetup/lib/python3.4/site-packages/IPython/kernel/__main__.py:6: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" ] } ], "source": [ "%%time\n", "\n", "grouped = df.groupby('Chromosome')\n", "annotated_groups = []\n", "for chromosome, group in grouped:\n", "    # Work on an explicit copy: adding a column to the slice handed out by\n", "    # groupby is what raised SettingWithCopyWarning.\n", "    group = group.copy()\n", "    positions = group['Position'].tolist()\n", "    group['Overlap'] = group.apply(partial(overlap, positions=positions), axis=1)\n", "    annotated_groups.append(group)\n", "\n", "# reset_index() returns a new frame; the bare call discarded it, so keep the result.\n", "# drop=True avoids adding the old row labels as an extra 'index' column.\n", 
"results = pd.concat(annotated_groups).reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ChromosomePositionReferenceAlternateOverlap
1314864GA6
1519839GC5
2318972TC3
5816162AG5
6811365TG6
\n", "
" ], "text/plain": [ " Chromosome Position Reference Alternate Overlap\n", "13 1 4864 G A 6\n", "15 1 9839 G C 5\n", "23 1 8972 T C 3\n", "58 1 6162 A G 5\n", "68 1 1365 T G 6" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results.head(5)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(5000, 5)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Parallel version" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# IPython.parallel was split out into the ipyparallel package in IPython 4;\n", "# fall back to the old location on IPython 3 and earlier.\n", "try:\n", "    from ipyparallel import Client\n", "except ImportError:\n", "    from IPython.parallel import Client" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "4" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.cpu_count()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "4" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c = Client()\n", "pool = c[:]\n", "len(c.ids)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%px --local\n", "\n", "from functools import partial\n", "\n", "\n", "def overlap(line, positions, window=100):\n", "    \"\"\"Count the entries of `positions` within `window` bp of `line['Position']`.\n", "\n", "    Both bounds are inclusive, so a row whose own position is listed in\n", "    `positions` is counted too. `window` defaults to the 100 bp used in this\n", "    notebook.\n", "    \"\"\"\n", "    position = line['Position']  # read once rather than twice per comparison\n", "    return len([i for i in positions if position - window <= i <= position + window])\n", "\n", "\n", "def parse_group(items):\n", "    \"\"\"Return a copy of the group in a `(key, group)` pair with an 'Overlap' column added.\n", "\n", "    `items` is one element obtained by iterating a groupby object; the key is\n", "    not used. The overlap of every row is counted against the positions of\n", "    its own group.\n", "    \"\"\"\n", "    _, group = items\n", "    # Copy first so the frame received from the caller is left untouched.\n", "    group = group.copy()\n", "    positions = group['Position'].tolist()\n", "    group['Overlap'] = group.apply(partial(overlap, positions=positions), axis=1)\n", "    return group" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.28 s, sys: 
271 ms, total: 3.55 s\n", "Wall time: 10.6 s\n" ] } ], "source": [ "%%time\n", "\n", "grouped = df.groupby('Chromosome')\n", "# Iterating the map result yields one annotated group per (key, group) pair.\n", "parallel_groups = list(pool.map(parse_group, grouped))\n", "\n", "# reset_index() returns a new frame; the bare call discarded it, so keep the result.\n", "# drop=True avoids adding the old row labels as an extra 'index' column.\n", "results_parallel = pd.concat(parallel_groups).reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ChromosomePositionReferenceAlternateOverlap
1314864GA6
1519839GC5
2318972TC3
5816162AG5
6811365TG6
\n", "
" ], "text/plain": [ " Chromosome Position Reference Alternate Overlap\n", "13 1 4864 G A 6\n", "15 1 9839 G C 5\n", "23 1 8972 T C 3\n", "58 1 6162 A G 5\n", "68 1 1365 T G 6" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results_parallel.head(5)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(5000, 5)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results_parallel.shape" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }