{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# IPython parallel computing clusters"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"from functools import partial"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a fake dataset of SNV"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"CHROMOSOMES = list(range(1, 23)) + ['X', 'Y']\n",
"BASES = list('ACGT')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create a dataframe\n",
"df = pd.DataFrame({'Chromosome': np.random.choice(CHROMOSOMES, size=5000, replace=True, p=None),\n",
" 'Position': np.random.randint(1000, 10000, size=5000),\n",
" 'Reference': np.random.choice(BASES, size=5000, replace=True, p=None)})\n",
"\n",
"df['Alternate'] = df.apply(lambda x: np.random.choice([i for i in BASES if i != x['Reference']]), axis=1)\n",
"df = df[['Chromosome', 'Position', 'Reference', 'Alternate']]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" Chromosome | \n",
" Position | \n",
" Reference | \n",
" Alternate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 18 | \n",
" 3313 | \n",
" T | \n",
" C | \n",
"
\n",
" \n",
" 1 | \n",
" 22 | \n",
" 3062 | \n",
" T | \n",
" C | \n",
"
\n",
" \n",
" 2 | \n",
" 11 | \n",
" 2584 | \n",
" C | \n",
" A | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
" 2867 | \n",
" C | \n",
" A | \n",
"
\n",
" \n",
" 4 | \n",
" 20 | \n",
" 8704 | \n",
" T | \n",
" C | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Chromosome Position Reference Alternate\n",
"0 18 3313 T C\n",
"1 22 3062 T C\n",
"2 11 2584 C A\n",
"3 6 2867 C A\n",
"4 20 8704 T C"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(5000, 4)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check if the mutations overlap by considering windows of 100bp"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Non parallel version"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def overlap(line, positions):\n",
" return len([i for i in positions if line['Position'] - 100 <= i <= line['Position'] + 100])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 16.9 s, sys: 82.8 ms, total: 17 s\n",
"Wall time: 17.6 s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/loris/.virtualenvs/python3_meetup/lib/python3.4/site-packages/IPython/kernel/__main__.py:6: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
]
}
],
"source": [
"%%time\n",
"\n",
"grouped = df.groupby(['Chromosome'])\n",
"results = []\n",
"for count, group in grouped: \n",
" positions = group['Position'].tolist()\n",
" group['Overlap'] = group.apply(partial(overlap, positions=positions), axis=1)\n",
" results.append(group)\n",
" \n",
"results = pd.concat(results)\n",
"results.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Chromosome | \n",
" Position | \n",
" Reference | \n",
" Alternate | \n",
" Overlap | \n",
"
\n",
" \n",
" \n",
" \n",
" 13 | \n",
" 1 | \n",
" 4864 | \n",
" G | \n",
" A | \n",
" 6 | \n",
"
\n",
" \n",
" 15 | \n",
" 1 | \n",
" 9839 | \n",
" G | \n",
" C | \n",
" 5 | \n",
"
\n",
" \n",
" 23 | \n",
" 1 | \n",
" 8972 | \n",
" T | \n",
" C | \n",
" 3 | \n",
"
\n",
" \n",
" 58 | \n",
" 1 | \n",
" 6162 | \n",
" A | \n",
" G | \n",
" 5 | \n",
"
\n",
" \n",
" 68 | \n",
" 1 | \n",
" 1365 | \n",
" T | \n",
" G | \n",
" 6 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Chromosome Position Reference Alternate Overlap\n",
"13 1 4864 G A 6\n",
"15 1 9839 G C 5\n",
"23 1 8972 T C 3\n",
"58 1 6162 A G 5\n",
"68 1 1365 T G 6"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(5000, 5)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Parallel version"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from IPython.parallel import Client"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.cpu_count()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c = Client()\n",
"pool = c[:]\n",
"len(c.ids)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%px --local\n",
"\n",
"from functools import partial\n",
"\n",
"\n",
"def overlap(line, positions):\n",
" return len([i for i in positions if line['Position'] - 100 <= i <= line['Position'] + 100])\n",
"\n",
"\n",
"def parse_group(items):\n",
" count, group = items\n",
" positions = group['Position'].tolist()\n",
" group['Overlap'] = group.apply(partial(overlap, positions=positions), axis=1)\n",
" return group"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.28 s, sys: 271 ms, total: 3.55 s\n",
"Wall time: 10.6 s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"grouped = df.groupby(['Chromosome'])\n",
"results_parallel = []\n",
"for result in pool.map(parse_group, grouped):\n",
" results_parallel.append(result)\n",
" \n",
"results_parallel = pd.concat(results_parallel)\n",
"results_parallel.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Chromosome | \n",
" Position | \n",
" Reference | \n",
" Alternate | \n",
" Overlap | \n",
"
\n",
" \n",
" \n",
" \n",
" 13 | \n",
" 1 | \n",
" 4864 | \n",
" G | \n",
" A | \n",
" 6 | \n",
"
\n",
" \n",
" 15 | \n",
" 1 | \n",
" 9839 | \n",
" G | \n",
" C | \n",
" 5 | \n",
"
\n",
" \n",
" 23 | \n",
" 1 | \n",
" 8972 | \n",
" T | \n",
" C | \n",
" 3 | \n",
"
\n",
" \n",
" 58 | \n",
" 1 | \n",
" 6162 | \n",
" A | \n",
" G | \n",
" 5 | \n",
"
\n",
" \n",
" 68 | \n",
" 1 | \n",
" 1365 | \n",
" T | \n",
" G | \n",
" 6 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Chromosome Position Reference Alternate Overlap\n",
"13 1 4864 G A 6\n",
"15 1 9839 G C 5\n",
"23 1 8972 T C 3\n",
"58 1 6162 A G 5\n",
"68 1 1365 T G 6"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results_parallel.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(5000, 5)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results_parallel.shape"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}