{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IPython parallel computing clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from functools import partial"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a fake dataset of SNV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "CHROMOSOMES = list(range(1, 23)) + ['X', 'Y']\n",
    "BASES = list('ACGT')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Create a dataframe\n",
    "df = pd.DataFrame({'Chromosome': np.random.choice(CHROMOSOMES, size=5000, replace=True, p=None),\n",
    "                   'Position': np.random.randint(1000, 10000, size=5000),\n",
    "                   'Reference': np.random.choice(BASES, size=5000, replace=True, p=None)})\n",
    "\n",
    "df['Alternate'] = df.apply(lambda x: np.random.choice([i for i in BASES if i != x['Reference']]), axis=1)\n",
    "df = df[['Chromosome', 'Position', 'Reference', 'Alternate']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Chromosome</th>\n",
       "      <th>Position</th>\n",
       "      <th>Reference</th>\n",
       "      <th>Alternate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>18</td>\n",
       "      <td>3313</td>\n",
       "      <td>T</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>22</td>\n",
       "      <td>3062</td>\n",
       "      <td>T</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>11</td>\n",
       "      <td>2584</td>\n",
       "      <td>C</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6</td>\n",
       "      <td>2867</td>\n",
       "      <td>C</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20</td>\n",
       "      <td>8704</td>\n",
       "      <td>T</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Chromosome  Position Reference Alternate\n",
       "0         18      3313         T         C\n",
       "1         22      3062         T         C\n",
       "2         11      2584         C         A\n",
       "3          6      2867         C         A\n",
       "4         20      8704         T         C"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5000, 4)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check if the mutations overlap by considering windows of 100bp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Non parallel version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def overlap(line, positions):\n",
    "    return len([i for i in positions if line['Position'] - 100 <= i <= line['Position'] + 100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 16.9 s, sys: 82.8 ms, total: 17 s\n",
      "Wall time: 17.6 s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/loris/.virtualenvs/python3_meetup/lib/python3.4/site-packages/IPython/kernel/__main__.py:6: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "grouped = df.groupby(['Chromosome'])\n",
    "results = []\n",
    "for count, group in grouped:    \n",
    "    positions = group['Position'].tolist()\n",
    "    group['Overlap'] = group.apply(partial(overlap, positions=positions), axis=1)\n",
    "    results.append(group)\n",
    "    \n",
    "results = pd.concat(results)\n",
    "results.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Chromosome</th>\n",
       "      <th>Position</th>\n",
       "      <th>Reference</th>\n",
       "      <th>Alternate</th>\n",
       "      <th>Overlap</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>1</td>\n",
       "      <td>4864</td>\n",
       "      <td>G</td>\n",
       "      <td>A</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1</td>\n",
       "      <td>9839</td>\n",
       "      <td>G</td>\n",
       "      <td>C</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>1</td>\n",
       "      <td>8972</td>\n",
       "      <td>T</td>\n",
       "      <td>C</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>1</td>\n",
       "      <td>6162</td>\n",
       "      <td>A</td>\n",
       "      <td>G</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68</th>\n",
       "      <td>1</td>\n",
       "      <td>1365</td>\n",
       "      <td>T</td>\n",
       "      <td>G</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Chromosome  Position Reference Alternate  Overlap\n",
       "13          1      4864         G         A        6\n",
       "15          1      9839         G         C        5\n",
       "23          1      8972         T         C        3\n",
       "58          1      6162         A         G        5\n",
       "68          1      1365         T         G        6"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5000, 5)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parallel version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from IPython.parallel import Client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.cpu_count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = Client()\n",
    "pool = c[:]\n",
    "len(c.ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%px --local\n",
    "\n",
    "from functools import partial\n",
    "\n",
    "\n",
    "def overlap(line, positions):\n",
    "    return len([i for i in positions if line['Position'] - 100 <= i <= line['Position'] + 100])\n",
    "\n",
    "\n",
    "def parse_group(items):\n",
    "    count, group = items\n",
    "    positions = group['Position'].tolist()\n",
    "    group['Overlap'] = group.apply(partial(overlap, positions=positions), axis=1)\n",
    "    return group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.28 s, sys: 271 ms, total: 3.55 s\n",
      "Wall time: 10.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "grouped = df.groupby(['Chromosome'])\n",
    "results_parallel = []\n",
    "for result in pool.map(parse_group, grouped):\n",
    "    results_parallel.append(result)\n",
    "    \n",
    "results_parallel = pd.concat(results_parallel)\n",
    "results_parallel.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Chromosome</th>\n",
       "      <th>Position</th>\n",
       "      <th>Reference</th>\n",
       "      <th>Alternate</th>\n",
       "      <th>Overlap</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>1</td>\n",
       "      <td>4864</td>\n",
       "      <td>G</td>\n",
       "      <td>A</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1</td>\n",
       "      <td>9839</td>\n",
       "      <td>G</td>\n",
       "      <td>C</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>1</td>\n",
       "      <td>8972</td>\n",
       "      <td>T</td>\n",
       "      <td>C</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>1</td>\n",
       "      <td>6162</td>\n",
       "      <td>A</td>\n",
       "      <td>G</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68</th>\n",
       "      <td>1</td>\n",
       "      <td>1365</td>\n",
       "      <td>T</td>\n",
       "      <td>G</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Chromosome  Position Reference Alternate  Overlap\n",
       "13          1      4864         G         A        6\n",
       "15          1      9839         G         C        5\n",
       "23          1      8972         T         C        3\n",
       "58          1      6162         A         G        5\n",
       "68          1      1365         T         G        6"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results_parallel.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5000, 5)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results_parallel.shape"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}