{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "%load_ext autoreload\n", "%autoreload 2\n", "import matplotlib.pyplot as plt\n", "import os.path as op\n", "import pandas as pd\n", "import numpy as np\n", "import seaborn as sn\n", "\n", "sn.set(style=\"whitegrid\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from mriqc.classifier import data as mcd\n", "abide, _ = mcd.read_dataset(x_path, y_path, rate_label='rater_1')\n", "sites = list(sorted(set(abide.site.values.ravel())))\n", "\n", "fmt = r'{site} & \\pixmat{{{size[0]:d}$\\pm${sr[0]:d}}}{{{size[1]:d}$\\pm${sr[1]:d}}}{{{size[2]:d}$\\pm${sr[1]:d}}}'\n", "fmt += r'& \\pixmat[mm]{{{sp[0]:.2f}$\\pm${spr[0]:.2f}}}{{{sp[1]:.2f}$\\pm${spr[1]:.2f}}}{{{sp[2]:.2f}$\\pm${spr[1]:.2f}}}'\n", "\n", "\n", "for site in sites:\n", " subabide = abide.loc[abide.site.str.contains(site)]\n", " \n", " medians = np.median(subabide[['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']],\n", " axis=0)\n", " \n", " mins = np.abs(medians - np.min(\n", " subabide[['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']], axis=0))\n", "\n", " maxs = np.abs(medians - np.max(\n", " subabide[['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']], axis=0))\n", "\n", " ranges = np.max(np.vstack((maxs, mins)), axis=0)\n", " \n", " print(\n", " fmt.format(\n", " site=site,\n", " size=tuple(medians[:3].astype(int)),\n", " sr=tuple(ranges[:3].astype(int)),\n", " sp=tuple(medians[3:]),\n", " spr=tuple(ranges[3:]),\n", "\n", "))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#data_path = '/home/oesteban/Google Drive/mriqc'\n", "data_path = '/home/oesteban/tmp/mriqc-ml-tests-2/'\n", "out_path = data_path\n", "loso = pd.read_csv(op.join(data_path, 'cv_loso_inner.csv'), index_col=False)\n", "kfold = pd.read_csv(op.join(data_path, 'cv_kfold_inner.csv'), index_col=False)\n", "\n", "kfold_outer = pd.read_csv(op.join(data_path, 'cv_kfold_outer.csv'), index_col=False)\n", "loso_outer = pd.read_csv(op.join(data_path, 'cv_loso_outer.csv'), index_col=False)\n", "\n", "def gen_newparams(dataframe):\n", " thisdf = dataframe.copy()\n", " thisdf['zscored_str'] = ['nzs'] * len(thisdf['zscored'])\n", " thisdf.loc[thisdf.zscored == 1, 'zscored_str'] = 'zs'\n", " thisdf['params'] = thisdf['clf'] + '-' + thisdf['zscored_str'] + ' ' + thisdf['params']\n", " del thisdf['zscored_str']\n", " return thisdf\n", "\n", "loso = gen_newparams(loso)\n", "kfold = gen_newparams(kfold)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "loso_models_list = list(set(loso.params.values.ravel().tolist()))\n", "kfold_models_list = list(set(kfold.params.values.ravel().tolist()))\n", "\n", "best_param = {}\n", "\n", "spstr = ['LoSo', '10-fold']\n", "best_models = {}\n", "for i, split_cv in enumerate([loso, kfold]):\n", " best_models[spstr[i]] = {}\n", " splitcols = [col for col in split_cv.columns.ravel() if col.startswith('split0')]\n", " for clf in ['svc_linear-nzs', 'svc_rbf-nzs', 'rfc-nzs', 'svc_linear-zs', 'svc_rbf-zs', 'rfc-zs']:\n", " thismodeldf = split_cv.loc[split_cv.params.str.contains(clf)]\n", " max_auc = thismodeldf.mean_auc.max()\n", " best = thismodeldf.loc[thismodeldf.mean_auc >= max_auc]\n", " best_list = best.params.values.ravel().tolist()\n", " \n", " if len(best_list) == 1:\n", " best_models[spstr[i]][clf] = best_list[0]\n", " else:\n", " overall_means = [thismodeldf.loc[thismodeldf.params.str.contains(pset), 'mean_auc'].mean()\n", " for pset in best_list]\n", " overall_max = np.max(overall_means)\n", " if sum([val >= overall_max for val in overall_means]) == 1:\n", " best_models[spstr[i]][clf] = best_list[np.argmax(overall_means)]\n", " else:\n", " best_models[spstr[i]][clf] = best_list[0]\n", " \n", "newdict = {'AUC': [], 'Classifier': [], 'Split scheme': []}\n", "\n", "modelnames = {'rfc-nzs': 'RFC-nzs', 'rfc-zs': 'RFC-zs',\n", " 'svc_linear-nzs': 'SVC_lin-nzs', 'svc_linear-zs': 'SVC_lin-zs',\n", " 'svc_rbf-nzs': 'SVC_rbf-nzs', 'svc_rbf-zs': 'SVC_rbf-zs'}\n", "\n", "for key, val in list(best_models['LoSo'].items()):\n", " scores = loso.loc[loso.params.str.contains(val), 'mean_auc'].values.ravel().tolist()\n", " nscores = len(scores)\n", " \n", " newdict['AUC'] += scores\n", " newdict['Classifier'] += [modelnames[key]] * nscores\n", " newdict['Split scheme'] += ['LoSo (16 folds)'] * nscores\n", " \n", "for key, val in list(best_models['10-fold'].items()):\n", " scores = kfold.loc[kfold.params.str.contains(val), 'mean_auc'].values.ravel().tolist()\n", " nscores = len(scores)\n", " \n", " newdict['AUC'] += scores\n", " newdict['Classifier'] += [modelnames[key]] * nscores\n", " newdict['Split scheme'] += ['10-fold'] * nscores\n", "\n", "newdf = pd.DataFrame(newdict).sort_values(by=['Split scheme', 'Classifier'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def plot_cv_outer(data, score='auc', zscored=0, ax=None, ds030_score=None,\n", " split_type='LoSo', color='dodgerblue'):\n", " \n", " if ax is None:\n", " ax = plt.gca()\n", " \n", " outer_score = data.loc[data[score].notnull(), [score, 'zscored']]\n", " sn.distplot(outer_score.loc[outer_score.zscored==zscored, score],\n", " hist=True, norm_hist=True, ax=ax, color=color, label=split_type)\n", " ax.set_xlim([0.4, 1.0])\n", " ax.grid(False)\n", " ax.set_yticklabels([])\n", " \n", " mean = outer_score.loc[outer_score.zscored==zscored, score].mean()\n", " std = outer_score.loc[outer_score.zscored==zscored, score].std()\n", "\n", " mean_coord = draw_line(mean, ax=ax, color=color, lw=2.0, marker='o', extend=True)\n", " \n", " ymax = ax.get_ylim()[1]\n", " draw_line(mean - std, ax=ax, color=color, extend=True)\n", " draw_line(mean + std, ax=ax, color=color, extend=True)\n", " \n", " \n", " ax.annotate(\n", " '$\\mu$=%0.3f' % mean, xy=(mean_coord[0], 0.75*ymax), xytext=(-35, 30),\n", " textcoords='offset points', va='center', color='w', size=14,\n", " bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),\n", " arrowprops=dict(\n", " arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,\n", " fc=color, ec='none', relpos=(0.5, 0.5)))\n", " sigmay = 0.70*ymax\n", " ax.annotate(s='', xy=(mean - std, sigmay), xytext=(mean + std, sigmay), arrowprops=dict(arrowstyle='<->'))\n", " ax.annotate(\n", " '$2\\sigma$=%0.3f' % (2 * std), xy=(mean_coord[0], 0.70*ymax), xytext=(-25, -12),\n", " textcoords='offset points', va='center', color='k', size=12,\n", " bbox=dict(boxstyle='round', fc='w', ec='none', color='none', alpha=.7, lw=0))\n", " \n", " if ds030_score is not None:\n", " ds030_coord = draw_line(ds030_score, ax=ax, color='k', marker='o')\n", " ax.annotate(\n", " 'DS030', xy=ds030_coord, xytext=(-100, 0),\n", " textcoords='offset points', va='center', color='w', size=16,\n", " bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),\n", " arrowprops=dict(\n", " arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,\n", " fc=color, ec='none', relpos=(0.5, 0.5)))\n", " \n", " \n", "def draw_line(score, ax=None, color='k', marker=None, lw=.7, extend=False):\n", " if ax is None:\n", " ax = plt.gca()\n", " \n", " if score > 1.0:\n", " score = 1.0\n", " \n", " coords = [score, -1]\n", " pdf_points = ax.lines[0].get_data()\n", " coords[1] = np.interp([coords[0]], pdf_points[0], pdf_points[1])\n", " \n", " if extend:\n", " ax.axvline(coords[0], ymin=coords[1] / ax.get_ylim()[1], ymax=0.75, color='gray', lw=.7)\n", " \n", " ax.axvline(coords[0], ymin=coords[1] / ax.get_ylim()[1], ymax=0, color=color, marker=marker, markevery=2,\n", " markeredgewidth=1.5, markerfacecolor='w', markeredgecolor=color, lw=lw)\n", "\n", " return coords" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sn.set(style=\"whitegrid\")\n", "\n", "fig = plt.figure(figsize=(20, 8)) \n", "ax1 = plt.subplot2grid((2,4), (0,0), colspan=2, rowspan=2)\n", "\n", "sn.violinplot(x='Classifier', y='AUC', hue='Split scheme', data=newdf, split=True,\n", " palette=['dodgerblue', 'darkorange'], ax=ax1)\n", "ax1.set_ylim([0.70, 1.0])\n", "ax1.set_ylabel('AUC')\n", "ax1.set_xlabel('Model')\n", "ax1.set_title('Model selection - Inner loop of nested cross-validation')\n", "\n", "ax2 = plt.subplot2grid((2,4), (0, 2))\n", "plot_cv_outer(kfold_outer, zscored=0, score='auc', ax=ax2, ds030_score=0.695, split_type='10-fold')\n", "ax2.set_xlabel('')\n", "ax2.legend()\n", "ax2.set_title('Evaluation - Outer loop of nested cross-validation')\n", "ax2.title.set_position([1.1, 1.0])\n", "\n", "ax3 = plt.subplot2grid((2,4), (1, 2))\n", "plot_cv_outer(loso_outer, zscored=0, score='auc', ax=ax3, ds030_score=0.695, color='darkorange', split_type='LoSo (17 folds)')\n", "ax3.legend()\n", "ax3.set_xlabel('AUC')\n", "\n", "ax4 = plt.subplot2grid((2,4), (0, 3))\n", "plot_cv_outer(kfold_outer, zscored=0, score='acc', ax=ax4, ds030_score=0.7283, split_type='10-fold')\n", "ax4.set_xlabel('')\n", "ax4.legend()\n", "\n", "ax5 = plt.subplot2grid((2,4), (1, 3))\n", "plot_cv_outer(loso_outer, zscored=0, score='acc', ax=ax5, ds030_score=0.7283, color='darkorange', split_type='LoSo (17 folds)')\n", "ax5.legend()\n", "ax5.set_xlabel('Accuracy')\n", "\n", "\n", "fig.savefig(op.join(out_path, 'crossvalidation.pdf'),\n", " bbox_inches='tight', pad_inches=0, dpi=300)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "zscoreddf = loso_outer.loc[loso_outer.zscored == 0, ['auc', 'acc', 'site']]\n", "palette = sn.color_palette(\"cubehelix\", len(set(zscoreddf.site)))\n", "sn.pairplot(zscoreddf.loc[zscoreddf.auc.notnull(), ['auc', 'acc', 'site']], hue='site', palette=palette)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sites = sorted(list(set(loso_outer.site.ravel().tolist())))\n", "palette = sn.color_palette(\"husl\", len(sites))\n", "fig = plt.figure()\n", "for i, site in enumerate(sites):\n", " sitedf = loso_outer.loc[loso_outer.site == site]\n", " accdf = sitedf.loc[sitedf.zscored==0]\n", " sn.distplot(accdf.acc.values.ravel(), bins=20, kde=0, label=site, color=palette[i])\n", "\n", "fig.gca().legend()\n", "fig.gca().set_xlim([0.5, 1.0])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }