{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "import os.path as op\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from mriqc.viz.misc import (\n",
    "    raters_variability_plot, plot_abide_stripplots, plot_corrmat, plot_histograms, figure1, plot_batches\n",
    ")\n",
    "from pkg_resources import resource_filename as pkgrf\n",
    "from mriqc.classifier.data import read_dataset, combine_datasets\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.gridspec import GridSpec\n",
    "import seaborn as sn\n",
    "sn.set(style=\"whitegrid\")\n",
    "\n",
    "out_path = op.abspath('results')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Feature (x_*) and label (y_*) tables that ship inside the mriqc package.\n",
    "x_path = pkgrf('mriqc', 'data/csv/x_abide.csv')\n",
    "y_path = pkgrf('mriqc', 'data/csv/y_abide.csv')\n",
    "ds030_x_path = pkgrf('mriqc', 'data/csv/x_ds030.csv')\n",
    "ds030_y_path = pkgrf('mriqc', 'data/csv/y_ds030.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Testing sklearn's splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import PredefinedSplit\n",
    "naive_x = np.arange(20)\n",
    "naive_y = np.zeros(15).tolist() + np.ones(5).tolist()\n",
    "\n",
    "# test_fold is -1 for the 15 zero-labelled samples and 0 for the 5 one-labelled samples.\n",
    "ps = PredefinedSplit(test_fold=(np.array(naive_y) - 1).tolist())\n",
    "print(ps.get_n_splits(naive_x, naive_y))\n",
    "\n",
    "for train_index, test_index in ps.split():\n",
    "    print(train_index)\n",
    "    print(test_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from mriqc.classifier.sklearn._split import RepeatedPartiallyHeldOutKFold\n",
    "\n",
    "# NOTE(review): `fulldata` and `coi` are used from here on but are never defined in this\n",
    "# notebook -- they must already exist in the kernel before this cell is run.\n",
    "train_y = fulldata[['rater_1']].values.ravel()\n",
    "groups = (fulldata.site == 'UM').astype(int)\n",
    "sp = RepeatedPartiallyHeldOutKFold(n_splits=3).split(fulldata.values, y=train_y, groups=groups)\n",
    "\n",
    "for i, (train_index, test_index) in enumerate(sp):\n",
    "#     print(len(train_index), len(test_index))\n",
    "    y = train_y[test_index]\n",
    "    unique_y, y_inversed = np.unique(y, return_inverse=True)\n",
    "    y_counts = np.bincount(y_inversed)\n",
    "#     print(\"Counts[%d]:\" % i, y_counts)\n",
    "\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from mriqc.classifier.sklearn._split import RepeatedBalancedKFold\n",
    "\n",
    "train_y = fulldata[['rater_1']].values.ravel()\n",
    "sp = RepeatedBalancedKFold(n_splits=10, n_repeats=5).split(fulldata[coi], y=train_y)\n",
    "\n",
    "# Print the per-class sample counts of every test fold.\n",
    "for i, (train_index, test_index) in enumerate(sp):\n",
    "    y = train_y[test_index]\n",
    "    unique_y, y_inversed = np.unique(y, return_inverse=True)\n",
    "    y_counts = np.bincount(y_inversed)\n",
    "    print(\"Counts[%d]:\" % i, y_counts)\n",
    "\n",
    "from mriqc.classifier.sklearn._split import RobustLeavePGroupsOut\n",
    "\n",
    "# Encode every site name as an integer group id.\n",
    "sites = fulldata[['site']].values.ravel().tolist()\n",
    "sitenames = list(set(sites))\n",
    "groups = [sitenames.index(s) for s in sites]\n",
    "cv = RobustLeavePGroupsOut(n_groups=1)\n",
    "cvs = cv.split(fulldata[coi], y=fulldata[['rater_1']].values.ravel().tolist(), groups=groups)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Testing new preprocessing steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from mriqc.classifier.sklearn import preprocessing as mcsp\n",
    "# NOTE(review): `scaled` is only created by the BatchScaler cell further down and `features`\n",
    "# is never defined in this notebook; on a fresh kernel this cell raises NameError.\n",
    "select = mcsp.SiteCorrelationSelector()\n",
    "selected = select.fit_transform(scaled[features + ['site']].values, None)\n",
    "try1 = scaled[features + ['site']].columns[select.mask_].ravel().tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print('Removed %s:\\n' % sorted(list(set(features) - set(try1))))\n",
    "print('Kept %s:\\n' % sorted(try1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(try1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from mriqc.classifier.sklearn import preprocessing as mcsp\n",
    "\n",
    "# Run the noise-winnow selection several times and keep only the features that every\n",
    "# run retains. Each run reads the mask of its *own* selector: the earlier copy-pasted\n",
    "# version read `select.mask_` (the first selector) for all four runs, so the\n",
    "# intersection was simply the outcome of run 1.\n",
    "n_winnow_runs = 4\n",
    "winnow_kept = []\n",
    "for _ in range(n_winnow_runs):\n",
    "    select = mcsp.CustFsNoiseWinnow()\n",
    "    selected = select.fit_transform(fulldata[features].values, fulldata.rater_1.values)\n",
    "    winnow_kept.append(fulldata[features].columns[select.mask_].ravel().tolist())\n",
    "\n",
    "# Keep the historical names available for any later ad-hoc inspection.\n",
    "try1, try2, try3, try4 = winnow_kept\n",
    "\n",
    "intersection = set(try1) & set(try2) & set(try3) & set(try4)\n",
    "print(list(sorted(intersection)))\n",
    "print(len(intersection))\n",
    "print(list(sorted(set(features) - intersection)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelBinarizer\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# NOTE(review): the first mask is overwritten on the next line, and `selmask` is never\n",
    "# applied to X below -- confirm which feature subset (if any) was intended.\n",
    "selmask = fulldata[features].columns.isin(intersection)\n",
    "selmask = np.zeros(len(features), dtype=bool)\n",
    "selmask[[0, 1, 2, 10]] = True\n",
    "\n",
    "# Predict the acquisition site (one indicator column per site) from the features.\n",
    "X = fulldata[features].values\n",
    "binarize = LabelBinarizer()\n",
    "y = binarize.fit_transform(fulldata.site.values.ravel().tolist())\n",
    "\n",
    "# The classifier below is fitted on X_train/y_train and scored on X_test/y_test in the\n",
    "# next cell, but the split itself was missing (NameError on a fresh kernel).\n",
    "# NOTE(review): the original split settings were not recorded; scikit-learn defaults\n",
    "# are used here -- confirm test size / seed.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
    "\n",
    "clf = ExtraTreesClassifier(\n",
    "    n_estimators=1000,\n",
    "    criterion='gini',\n",
    "    max_depth=None,\n",
    "    min_samples_split=2,\n",
    "    min_samples_leaf=1,\n",
    "    min_weight_fraction_leaf=0.0,\n",
    "    max_features='sqrt',\n",
    "    max_leaf_nodes=None,\n",
    "    min_impurity_split=1e-07,\n",
    "    bootstrap=True,\n",
    "    oob_score=False,\n",
    "    n_jobs=8,\n",
    "    random_state=None,\n",
    "    verbose=0,\n",
    "    warm_start=False,\n",
    "    class_weight='balanced'\n",
    ")\n",
    "\n",
    "clf = clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(roc_auc_score(y_test, clf.predict(X_test), average='macro', sample_weight=None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "clf.feature_importances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import RobustScaler\n",
    "from mriqc.classifier.sklearn import preprocessing as mcsp\n",
    "\n",
    "# Centre (with_scaling=False: no rescaling) the columns of interest, then plot them.\n",
    "scaler = mcsp.BatchScaler(RobustScaler(with_scaling=False), columns=coi)\n",
    "scaled = scaler.fit_transform(fulldata)\n",
    "fig = plot_batches(scaled[coi], excl_columns=['rater_1'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "# del fulldata['provenance']\n",
    "# Chain the scaling and the noise-winnow feature selection into a single estimator.\n",
    "clf = Pipeline([\n",
    "    ('std', mcsp.BatchScaler(RobustScaler(), groups='site', columns=coi)),\n",
    "    ('feature_selection', mcsp.CustFsNoiseWinnow(features=features)),\n",
    "])\n",
    "\n",
    "clf.fit(fulldata, fulldata[['rater_1']].values.ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Machine-specific locations; edit these before running on another system.\n",
    "# `work_dir` is the scratch folder for every CSV/figure written or read below.\n",
    "work_dir = '/home/oesteban/tmp/mriqc-ml-tests-2'\n",
    "mit_csv = '/home/oesteban/mriqc/mit-satra/T1-mit.csv'\n",
    "# NOTE(review): `data_path` is never defined in this notebook -- set it before this cell.\n",
    "abide_csv = op.join(data_path, 'runs/20170505_0.9.3-2017-04-23-2ba2c2e40c39/T1w.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mit_df = pd.read_csv(mit_csv, index_col=False, dtype={'subject_id': object})\n",
    "abide_df, pp_cols = read_dataset(abide_csv, op.join(data_path, 'ABIDE_QC_all.csv'), rate_label='rater_1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Reuse the `rater` column as a dataset flag (0 = ABIDE, 1 = MIT) and stack both tables.\n",
    "mit_df['rater'] = [1] * len(mit_df)\n",
    "mit_df['site'] = ['MIT'] * len(mit_df)\n",
    "abide_df['rater'] = [0] * len(abide_df)\n",
    "\n",
    "del abide_df['rater_1']\n",
    "mdata = pd.concat([abide_df, mit_df], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "zscored = mcsp.BatchRobustScaler(by='site', columns=coi).fit_transform(mdata)\n",
    "\n",
    "# Plot every column reported by read_dataset except the spacing/summary/size ones.\n",
    "colnames = [col for col in sorted(pp_cols)\n",
    "            if not col.startswith(('spacing', 'summary', 'size'))]\n",
    "\n",
    "nrows = len(colnames)\n",
    "# palette = ['dodgerblue', 'darkorange']\n",
    "\n",
    "fig = plt.figure(figsize=(18, 2 * nrows))\n",
    "gs = GridSpec(nrows, 2, hspace=0.2)\n",
    "\n",
    "# One row per column: raw values on the left, site-wise scaled values on the right.\n",
    "for i, col in enumerate(sorted(colnames)):\n",
    "    ax_nzs = plt.subplot(gs[i, 0])\n",
    "    ax_zsd = plt.subplot(gs[i, 1])\n",
    "\n",
    "    for frame, ax in ((mdata, ax_nzs), (zscored, ax_zsd)):\n",
    "        sn.distplot(frame.loc[(frame.rater == 0), col], norm_hist=False,\n",
    "                    label='ABIDE', ax=ax, color='dodgerblue')\n",
    "        sn.distplot(frame.loc[(frame.rater == 1), col], norm_hist=False,\n",
    "                    label='MIT', ax=ax, color='darkorange')\n",
    "\n",
    "        # Clip the x-axis to the 0.2-99.8 percentile range of the pooled values.\n",
    "        alldata = frame[[col]].values.ravel().tolist()\n",
    "        ax.set_xlim([np.percentile(alldata, 0.2), np.percentile(alldata, 99.8)])\n",
    "\n",
    "    ax_nzs.legend()\n",
    "    ax_zsd.set_ylabel(col)\n",
    "fig.savefig(op.join(work_dir, 'histograms-mit.svg'), format='svg', pad_inches=0, dpi=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Reload ABIDE: the `rater_1` column was deleted from the previous `abide_df` above.\n",
    "abide_df, pp_cols = read_dataset(abide_csv, op.join(data_path, 'ABIDE_QC_all.csv'), rate_label='rater_1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "accept = abide_df[abide_df.rater_1 == 0]\n",
    "exclude = abide_df[abide_df.rater_1 == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mit_df = pd.read_csv(mit_csv, index_col=False, dtype={'subject_id': object})\n",
    "\n",
    "# For each column, write a copy of the MIT table in which that single column is replaced\n",
    "# by the median of the `accept` ABIDE rows.\n",
    "# NOTE: despite its name, `means` holds medians.\n",
    "means = {}\n",
    "for col in sorted(colnames):\n",
    "    means[col] = np.median(accept[[col]].values)\n",
    "    mit_copy = mit_df.copy()\n",
    "    # Broadcast the scalar: assigning a length-n list through a list-of-columns key\n",
    "    # (`mit_copy[[col]] = [...]`) fails on current pandas with\n",
    "    # 'Columns must be same length as key'.\n",
    "    mit_copy[col] = means[col]\n",
    "\n",
    "    mit_copy.to_csv(op.join(work_dir, 'mit_t1_%s.csv' % col), index=False)\n",
    "\n",
    "    bad_m = np.median(exclude[[col]].values)\n",
    "    # Index with a plain label so .std() is a scalar that '%f' can format directly.\n",
    "    print('%s: %f +- %f :: %f +- %f' % (col, means[col], accept[col].std(), bad_m, exclude[col].std()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): the predicted_*.csv files are not written by this notebook; they must\n",
    "# already be present in `work_dir`.\n",
    "pred = pd.read_csv(op.join(work_dir, 'predicted_orig.csv'), index_col=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Add one column per substituted feature, holding the predictions read from its file.\n",
    "for col in sorted(colnames):\n",
    "    pred[col] = pd.read_csv(op.join(work_dir, 'predicted_mit_t1_%s.csv' % col)).prediction.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pred.to_csv(op.join(work_dir, 'predictions_wrt_iqms.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pred.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}