{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Analyzing replicability of connectivity-based multivariate BWAS on the Human Connectome Project dataset\n", "\n", "## Imports" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "ExecuteTime": { "end_time": "2021-08-03T20:04:15.431840Z", "start_time": "2021-08-03T20:04:14.753565Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sklearn.linear_model import Ridge\n", "from sklearn.svm import SVR\n", "from sklearn.model_selection import KFold, train_test_split, cross_val_predict, GridSearchCV\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.decomposition import PCA\n", "from joblib import Parallel, delayed\n", "from mlxtend.evaluate import permutation_test\n", "sns.set(rc={\"figure.figsize\":(4, 2)})\n", "sns.set_style(\"whitegrid\")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Load HCP data\n", "\n", "We load functional network matrices (netmats) from the HCP1200 release, as published on ConnectomeDB: https://db.humanconnectome.org/\nDue to licensing issues, the data are not supplied with this repository, but they can be downloaded from ConnectomeDB.\nSee [hcp_data/readme.md](hcp_data/readme.md) for more details. A sketch of a possible loading step follows the data preview below." ] }, { "cell_type": "code", "execution_count": 81, "outputs": [ { "data": { "text/html": "
| Subject | Release | Acquisition | Gender | Age | 3T_Full_MR_Compl | T1_Count | T2_Count | 3T_RS-fMRI_Count | 3T_RS-fMRI_PctCompl | 3T_Full_Task_fMRI | ... | Odor_Unadj | Odor_AgeAdj | PainIntens_RawScore | PainInterf_Tscore | Taste_Unadj | Taste_AgeAdj | Mars_Log_Score | Mars_Errs | Mars_Final | age |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100004 | S900 | Q06 | M | 22-25 | False | 0 | 0 | 0 | 0.0 | False | ... | 101.12 | 86.45 | 2.0 | 45.9 | 107.17 | 105.31 | 1.80 | 0.0 | 1.80 | 23.5 |
| 100206 | S900 | Q11 | M | 26-30 | True | 1 | 1 | 4 | 100.0 | True | ... | 108.79 | 97.19 | 1.0 | 49.7 | 72.63 | 72.03 | 1.84 | 0.0 | 1.84 | 28.0 |
| 100307 | Q1 | Q01 | F | 26-30 | True | 1 | 1 | 4 | 100.0 | True | ... | 101.12 | 86.45 | 0.0 | 38.6 | 71.69 | 71.76 | 1.76 | 0.0 | 1.76 | 28.0 |
| 100408 | Q3 | Q03 | M | 31-35 | True | 1 | 1 | 4 | 100.0 | True | ... | 108.79 | 98.04 | 2.0 | 52.6 | 114.01 | 113.59 | 1.76 | 2.0 | 1.68 | 33.0 |
| 100610 | S900 | Q08 | M | 26-30 | True | 2 | 1 | 4 | 100.0 | True | ... | 122.25 | 110.45 | 0.0 | 38.6 | 84.84 | 85.31 | 1.92 | 1.0 | 1.88 | 28.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 992774 | Q2 | Q02 | M | 31-35 | True | 2 | 2 | 4 | 100.0 | True | ... | 122.25 | 111.41 | 4.0 | 50.1 | 107.17 | 103.55 | 1.76 | 0.0 | 1.76 | 33.0 |
| 993675 | S900 | Q09 | F | 26-30 | True | 2 | 2 | 4 | 100.0 | True | ... | 122.25 | 110.45 | 0.0 | 38.6 | 84.07 | 84.25 | 1.80 | 1.0 | 1.76 | 28.0 |
| 994273 | S500 | Q06 | M | 26-30 | True | 1 | 1 | 4 | 100.0 | True | ... | 122.25 | 111.41 | 7.0 | 63.8 | 110.65 | 109.73 | 1.80 | 1.0 | 1.76 | 28.0 |
| 995174 | S1200 | Q13 | M | 22-25 | False | 1 | 1 | 2 | 0.0 | True | ... | 88.61 | 64.58 | 3.0 | 50.1 | 117.16 | 117.40 | 1.80 | 0.0 | 1.80 | 23.5 |
| 996782 | S900 | Q08 | F | 26-30 | True | 2 | 2 | 4 | 100.0 | True | ... | 108.79 | 97.19 | 0.0 | 38.6 | 75.43 | 73.72 | 1.84 | 0.0 | 1.84 | 28.0 |
1206 rows × 582 columns
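
The cell that builds this behavioral table and the `netmats_parcor` / `netmats_pearson` feature matrices used below is not part of this excerpt. The following is a minimal loading sketch, assuming the HCP1200 PTN netmats text files and a merged behavioral table have been placed under `hcp_data/` as described in [hcp_data/readme.md](hcp_data/readme.md); all file names here are assumptions, not the repository's actual layout. (The numeric `age` column in the preview appears to be the midpoint of the categorical `Age` range, e.g. 26-30 → 28.0.)

```python
import numpy as np
import pandas as pd

# Hypothetical file locations -- adjust to match hcp_data/readme.md.
NETMATS_PEARSON_FILE = "hcp_data/netmats1.txt"   # full-correlation ("Pearson") netmats
NETMATS_PARCOR_FILE = "hcp_data/netmats2.txt"    # partial-correlation netmats
SUBJECT_IDS_FILE = "hcp_data/subjectIDs.txt"
BEHAVIOR_FILE = "hcp_data/behavioral_data.csv"   # merged behavioral/demographic measures

# One row per subject, one column per flattened network edge.
subject_ids = np.loadtxt(SUBJECT_IDS_FILE, dtype=int)
netmats_pearson = pd.DataFrame(np.loadtxt(NETMATS_PEARSON_FILE), index=subject_ids)
netmats_parcor = pd.DataFrame(np.loadtxt(NETMATS_PARCOR_FILE), index=subject_ids)

# Behavioral/demographic table indexed by subject ID, as previewed above.
behavior = pd.read_csv(BEHAVIOR_FILE, index_col="Subject")
```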
|   | connectivity | model | target | n | r_discovery_cv | r_discovery_overfit | r_replication | p_discovery_cv | p_discovery_overfit | p_replication |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | netmats_parcor | PCA_SVR | age | 50 | -0.353027 | 0.774355 | 0.204078 | 0.997003 | 0.000999 | 0.091908 |
| 1 | netmats_parcor | PCA_SVR | age | 50 | 0.081894 | 0.913355 | 0.360970 | 0.275724 | 0.000999 | 0.001998 |
| 2 | netmats_parcor | PCA_SVR | age | 50 | -0.258248 | 0.841646 | 0.265158 | 0.962038 | 0.000999 | 0.025974 |
| 3 | netmats_parcor | PCA_SVR | age | 50 | 0.050106 | 0.899457 | -0.206159 | 0.374625 | 0.000999 | 0.923077 |
| 4 | netmats_parcor | PCA_SVR | age | 50 | 0.130115 | 0.892774 | 0.282873 | 0.196803 | 0.000999 | 0.026973 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5995 | netmats_pearson | PCA_SVR | PicSeq_AgeAdj | 501 | 0.125791 | 0.360809 | 0.062856 | 0.004995 | 0.000999 | 0.085914 |
| 5996 | netmats_pearson | PCA_SVR | PicSeq_AgeAdj | 501 | 0.060623 | 0.352693 | 0.142834 | 0.085914 | 0.000999 | 0.000999 |
| 5997 | netmats_pearson | PCA_SVR | PicSeq_AgeAdj | 501 | 0.084630 | 0.373386 | 0.163908 | 0.028971 | 0.000999 | 0.000999 |
| 5998 | netmats_pearson | PCA_SVR | PicSeq_AgeAdj | 501 | 0.071084 | 0.376163 | 0.109268 | 0.065934 | 0.000999 | 0.012987 |
| 5999 | netmats_pearson | PCA_SVR | PicSeq_AgeAdj | 501 | 0.015929 | 0.308118 | 0.032076 | 0.347652 | 0.000999 | 0.225774 |
6000 rows × 10 columns
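
The analysis cells in this notebook all call two helpers, `create_data` and `bootstrap_workhorse`, whose definitions fall outside this excerpt. The sketch below illustrates what one iteration plausibly computes, judging from the result columns above: a cross-validated and an "overfit" discovery correlation, a replication correlation, and permutation p-values (`permutation_test` is imported from mlxtend in the first cell, and p-values of 0.000999 are consistent with 1000 permutation rounds). Function and variable names here are illustrative assumptions, not the notebook's own implementation.

```python
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold, cross_val_predict, train_test_split
from mlxtend.evaluate import permutation_test


def corr_with_p(y_true, y_pred, seed, num_rounds=1000):
    """Pearson r between observed and predicted values, with an approximate permutation p-value."""
    r = np.corrcoef(y_true, y_pred)[0, 1]
    p = permutation_test(y_true, y_pred,
                         func=lambda a, b: np.corrcoef(a, b)[0, 1],
                         method="approximate", num_rounds=num_rounds, seed=seed)
    return r, p


def bootstrap_workhorse_sketch(X, y, sample_size, model, random_state):
    """Illustrative stand-in for one bootstrap iteration (not the notebook's exact code)."""
    # Draw non-overlapping discovery and replication subsamples of equal size.
    X_disc, X_rep, y_disc, y_rep = train_test_split(
        X, y, train_size=sample_size, test_size=sample_size, random_state=random_state)

    model = clone(model)

    # Discovery sample, cross-validated (honest effect-size estimate) ...
    cv_pred = cross_val_predict(model, X_disc, y_disc,
                                cv=KFold(n_splits=5, shuffle=True, random_state=random_state))
    r_discovery_cv, p_discovery_cv = corr_with_p(y_disc, cv_pred, random_state)

    # ... and without cross-validation (overfit: fit and predict on the same subjects).
    model.fit(X_disc, y_disc)
    r_discovery_overfit, p_discovery_overfit = corr_with_p(y_disc, model.predict(X_disc), random_state)

    # Replication sample: apply the discovery-trained model to held-out subjects.
    r_replication, p_replication = corr_with_p(y_rep, model.predict(X_rep), random_state)

    # Same return order as the unpacking in the analysis cells.
    return (r_discovery_cv, r_discovery_overfit, r_replication,
            p_discovery_cv, p_discovery_overfit, p_replication)
```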
|   | connectivity | model | target | n | r_discovery_cv | r_discovery_overfit | r_replication | p_discovery_cv | p_discovery_overfit | p_replication |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | netmats_parcor | ridge | age | 50 | -0.048518 | 1.0 | 0.404661 | 0.629371 | 0.000999 | 0.002997 |
| 1 | netmats_parcor | ridge | age | 50 | 0.111210 | 1.0 | 0.281041 | 0.210789 | 0.000999 | 0.026973 |
| 2 | netmats_parcor | ridge | age | 50 | 0.103342 | 1.0 | 0.107181 | 0.227772 | 0.000999 | 0.216783 |
| 3 | netmats_parcor | ridge | age | 50 | 0.255112 | 1.0 | -0.203981 | 0.047952 | 0.000999 | 0.909091 |
| 4 | netmats_parcor | ridge | age | 50 | 0.425614 | 1.0 | 0.385665 | 0.001998 | 0.000999 | 0.004995 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5995 | netmats_pearson | ridge | PicSeq_AgeAdj | 501 | 0.209622 | 1.0 | 0.217841 | 0.000999 | 0.000999 | 0.000999 |
| 5996 | netmats_pearson | ridge | PicSeq_AgeAdj | 501 | 0.237770 | 1.0 | 0.201980 | 0.000999 | 0.000999 | 0.000999 |
| 5997 | netmats_pearson | ridge | PicSeq_AgeAdj | 501 | 0.268639 | 1.0 | 0.203847 | 0.000999 | 0.000999 | 0.000999 |
| 5998 | netmats_pearson | ridge | PicSeq_AgeAdj | 501 | 0.173644 | 1.0 | 0.195714 | 0.001998 | 0.000999 | 0.000999 |
| 5999 | netmats_pearson | ridge | PicSeq_AgeAdj | 501 | 0.148498 | 1.0 | 0.204881 | 0.001998 | 0.000999 | 0.000999 |
6000 rows × 10 columns
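
The `Replicability at alpha` print statement in each analysis cell is truncated in this export right after `tmp_data_frame['p_discovery_cv']`. Judging from the columns involved, it presumably reports, among the bootstrap iterations that reach significance in the discovery sample, the fraction that are also significant in the replication sample. A hedged reconstruction of that quantity:

```python
import pandas as pd


def replicability(results: pd.DataFrame, alpha: float) -> float:
    """Assumed reading of the truncated expression: of the iterations significant in
    the discovery sample, the fraction that is also significant in replication."""
    discovered = results['p_discovery_cv'] < alpha
    return (results.loc[discovered, 'p_replication'] < alpha).mean()


# Intended usage inside the analysis loop (tmp_data_frame is the per-condition results frame):
# for alpha in [0.05, 0.01, 0.005, 0.001]:
#     print('Replicability at alpha =', alpha, ':', replicability(tmp_data_frame, alpha))
```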
\n" }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor,\n", " 'netmats_pearson': netmats_pearson\n", "}\n", "\n", "models = {\n", " 'ridge': Ridge()\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:\n", " for sample_size in [50, 100, 200, 300, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set])\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': p_replication\n", " })\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']| \n | connectivity | \nmodel | \ntarget | \nn | \nr_discovery_cv | \nr_discovery_overfit | \nr_replication | \np_discovery_cv | \np_discovery_overfit | \np_replication | \n
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | netmats_parcor | PCA_SVR | None | 50 | 0.019118 | 0.893737 | 0.062928 | 0.445554 | 0.000999 | 0.330669 |
| 1 | netmats_parcor | PCA_SVR | None | 50 | -0.319547 | 0.897756 | 0.134277 | 0.995005 | 0.000999 | 0.185814 |
| 2 | netmats_parcor | PCA_SVR | None | 50 | -0.120632 | 0.871983 | 0.052140 | 0.816184 | 0.000999 | 0.344655 |
| 3 | netmats_parcor | PCA_SVR | None | 50 | -0.224169 | 0.814265 | -0.009114 | 0.938062 | 0.000999 | 0.536464 |
| 4 | netmats_parcor | PCA_SVR | None | 50 | -0.175718 | 0.889370 | 0.083803 | 0.876124 | 0.000999 | 0.279720 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 495 | netmats_parcor | PCA_SVR | None | 501 | -0.071740 | 0.883689 | 0.026235 | 0.950050 | 0.000999 | 0.282717 |
| 496 | netmats_parcor | PCA_SVR | None | 501 | 0.009939 | 0.877526 | 0.019956 | 0.414585 | 0.000999 | 0.338661 |
| 497 | netmats_parcor | PCA_SVR | None | 501 | -0.009945 | 0.875216 | 0.015069 | 0.608392 | 0.000999 | 0.349650 |
| 498 | netmats_parcor | PCA_SVR | None | 501 | -0.020732 | 0.876886 | -0.082211 | 0.686314 | 0.000999 | 0.969031 |
| 499 | netmats_parcor | PCA_SVR | None | 501 | -0.058961 | 0.881125 | 0.014305 | 0.914086 | 0.000999 | 0.374625 |
500 rows × 10 columns
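
The null analysis above runs the same pipeline with `target_var = None`; the cell's source (which follows) notes in a comment that `create_data` "gives a random y when target is None". Since `create_data` itself is not included in this excerpt, the sketch below shows that behaviour under the assumption that the null target is simply drawn independently of the connectivity features; the function name and the `behavior` argument are illustrative.

```python
import numpy as np
import pandas as pd


def create_data_sketch(target, feature_data, behavior=None, random_state=0):
    """Illustrative stand-in for create_data (not the original implementation).

    feature_data: DataFrame of connectivity features indexed by subject ID.
    behavior:     DataFrame of behavioral/demographic measures indexed by subject ID.
    When target is None, y is pure noise, so any apparent predictive performance in
    the null analysis above reflects chance (and overfitting) alone.
    """
    if target is None:
        rng = np.random.default_rng(random_state)
        y = pd.Series(rng.standard_normal(len(feature_data)), index=feature_data.index)
        X = feature_data
    else:
        y = behavior[target].dropna()       # subjects with a non-missing target
        X = feature_data.loc[y.index]       # align connectivity features to those subjects
    return X.values, y.values
```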
\n" }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor\n", "}\n", "\n", "models = {\n", " 'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),\n", " ('svr', SVR())])\n", "\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in [None]:\n", " for sample_size in [50, 100, 200, 300, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set]) # gives a random y when target is None\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': p_replication\n", " })\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']| \n | connectivity | \nmodel | \ntarget | \nn | \nr_discovery_cv | \nr_discovery_overfit | \nr_replication | \np_discovery_cv | \np_discovery_overfit | \np_replication | \n
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | netmats_parcor | SVR | age | 50 | -0.330564 | 0.899135 | 0.288430 | 0.994006 | 0.000999 | 0.029970 |
| 1 | netmats_parcor | SVR | age | 50 | -0.023555 | 0.963181 | 0.291646 | 0.591409 | 0.000999 | 0.019980 |
| 2 | netmats_parcor | SVR | age | 50 | -0.176967 | 0.968481 | 0.092822 | 0.884116 | 0.000999 | 0.244755 |
| 3 | netmats_parcor | SVR | age | 50 | 0.093535 | 0.960486 | -0.161212 | 0.299700 | 0.000999 | 0.854146 |
| 4 | netmats_parcor | SVR | age | 50 | 0.220397 | 0.958377 | 0.310812 | 0.072927 | 0.000999 | 0.016983 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5995 | netmats_pearson | SVR | PicSeq_AgeAdj | 501 | 0.167868 | 0.469294 | 0.119368 | 0.000999 | 0.000999 | 0.004995 |
| 5996 | netmats_pearson | SVR | PicSeq_AgeAdj | 501 | 0.134985 | 0.493440 | 0.222072 | 0.001998 | 0.000999 | 0.000999 |
| 5997 | netmats_pearson | SVR | PicSeq_AgeAdj | 501 | 0.155819 | 0.522091 | 0.223508 | 0.000999 | 0.000999 | 0.000999 |
| 5998 | netmats_pearson | SVR | PicSeq_AgeAdj | 501 | 0.065164 | 0.484062 | 0.215828 | 0.081918 | 0.000999 | 0.000999 |
| 5999 | netmats_pearson | SVR | PicSeq_AgeAdj | 501 | 0.044413 | 0.398393 | 0.100157 | 0.139860 | 0.000999 | 0.018981 |
6000 rows × 10 columns
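
Each analysis cell also carries a commented-out seaborn scatterplot of the discovery versus replication correlations. For reference, an uncommented and lightly relabelled version of that snippet is shown below (the axis wording is mine; the original labels read 'in-sample (r)' and 'out-of-sample (r_pred)'):

```python
import matplotlib.pyplot as plt
import seaborn as sns


def plot_discovery_vs_replication(results):
    """Scatter of discovery (cross-validated) vs. replication correlations for one condition."""
    ax = sns.scatterplot(x='r_replication', y='r_discovery_cv', data=results)
    ax.set_xlabel('replication sample (r)')
    ax.set_ylabel('discovery sample, cross-validated (r)')
    ax.axhline(0, color='grey', lw=0.5)
    ax.axvline(0, color='grey', lw=0.5)
    plt.show()


# Example: plot_discovery_vs_replication(tmp_data_frame)
```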
|   | connectivity | model | target | n | r_discovery_cv | r_discovery_overfit | r_replication | p_discovery_cv | p_discovery_overfit | p_replication |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | netmats_parcor | PCA_Ridge | age | 50 | -0.260636 | 0.457983 | 0.225977 | 0.969031 | 0.001998 | 0.062937 |
| 1 | netmats_parcor | PCA_Ridge | age | 50 | 0.189752 | 0.687611 | 0.246691 | 0.098901 | 0.000999 | 0.049950 |
| 2 | netmats_parcor | PCA_Ridge | age | 50 | -0.014329 | 0.730443 | -0.045666 | 0.538462 | 0.000999 | 0.601399 |
| 3 | netmats_parcor | PCA_Ridge | age | 50 | 0.194567 | 0.710963 | -0.162378 | 0.101898 | 0.000999 | 0.861139 |
| 4 | netmats_parcor | PCA_Ridge | age | 50 | 0.223428 | 0.717670 | 0.304525 | 0.075924 | 0.000999 | 0.015984 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5995 | netmats_pearson | PCA_Ridge | PicSeq_AgeAdj | 501 | 0.174858 | 0.286206 | 0.086955 | 0.000999 | 0.000999 | 0.029970 |
| 5996 | netmats_pearson | PCA_Ridge | PicSeq_AgeAdj | 501 | 0.114124 | 0.261799 | 0.146330 | 0.006993 | 0.000999 | 0.000999 |
| 5997 | netmats_pearson | PCA_Ridge | PicSeq_AgeAdj | 501 | 0.193151 | 0.310800 | 0.164787 | 0.000999 | 0.000999 | 0.000999 |
| 5998 | netmats_pearson | PCA_Ridge | PicSeq_AgeAdj | 501 | 0.144112 | 0.285125 | 0.095999 | 0.002997 | 0.000999 | 0.018981 |
| 5999 | netmats_pearson | PCA_Ridge | PicSeq_AgeAdj | 501 | 0.028526 | 0.212140 | 0.091388 | 0.256743 | 0.000999 | 0.021978 |
6000 rows × 10 columns
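
Both dimensionality-reduced pipelines (`PCA_SVR` above and `PCA_Ridge`, whose source follows) pass `n_components=0.5` to `PCA`. In scikit-learn, a float between 0 and 1 is an explained-variance target: the pipeline keeps the smallest number of leading components that together explain 50% of the variance, so the retained dimensionality is data-dependent. A small sketch showing the construction and how to inspect the retained dimensionality:

```python
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# Same construction as the notebook's PCA_SVR model.
pca_svr = Pipeline([('pca', PCA(n_components=0.5)),   # keep components explaining 50% of the variance
                    ('svr', SVR())])

# After fitting (e.g., on a discovery subsample), the number of retained components
# is available on the fitted PCA step:
# pca_svr.fit(X_disc, y_disc)
# print(pca_svr.named_steps['pca'].n_components_)
```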
\n" }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor,\n", " 'netmats_pearson': netmats_pearson\n", "}\n", "\n", "models = {\n", " 'PCA_Ridge': Pipeline([('pca', PCA(n_components=0.5)),\n", " ('ridge', Ridge())])\n", "\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:\n", " for sample_size in [50, 100, 200, 300, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set])\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': p_replication\n", " })\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']| \n | connectivity | \nmodel | \ntarget | \nn | \nr_discovery_cv | \nr_discovery_overfit | \nr_replication | \np_discovery_cv | \np_discovery_overfit | \np_replication | \n
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | netmats_parcor | KernelRidge | age | 50 | -0.289530 | 1.0 | 0.097963 | 0.981019 | 0.000999 | 0.253746 |
| 1 | netmats_parcor | KernelRidge | age | 50 | -0.418438 | 1.0 | 0.020128 | 0.999001 | 0.000999 | 0.449550 |
| 2 | netmats_parcor | KernelRidge | age | 50 | -0.141016 | 1.0 | -0.140765 | 0.822178 | 0.000999 | 0.838162 |
| 3 | netmats_parcor | KernelRidge | age | 50 | 0.131642 | 1.0 | -0.102626 | 0.183816 | 0.000999 | 0.760240 |
| 4 | netmats_parcor | KernelRidge | age | 50 | 0.024683 | 1.0 | 0.203918 | 0.426573 | 0.000999 | 0.079920 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5995 | netmats_pearson | KernelRidge | PicSeq_AgeAdj | 501 | 0.202720 | 1.0 | 0.225979 | 0.000999 | 0.000999 | 0.000999 |
| 5996 | netmats_pearson | KernelRidge | PicSeq_AgeAdj | 501 | 0.223386 | 1.0 | 0.217658 | 0.000999 | 0.000999 | 0.000999 |
| 5997 | netmats_pearson | KernelRidge | PicSeq_AgeAdj | 501 | 0.274774 | 1.0 | 0.206499 | 0.000999 | 0.000999 | 0.000999 |
| 5998 | netmats_pearson | KernelRidge | PicSeq_AgeAdj | 501 | 0.166591 | 1.0 | 0.203236 | 0.000999 | 0.000999 | 0.000999 |
| 5999 | netmats_pearson | KernelRidge | PicSeq_AgeAdj | 501 | 0.142675 | 1.0 | 0.217554 | 0.001998 | 0.000999 | 0.000999 |
6000 rows × 10 columns
\n" }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "from sklearn.kernel_ridge import KernelRidge\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor,\n", " 'netmats_pearson': netmats_pearson\n", "}\n", "\n", "models = {\n", " 'KernelRidge': KernelRidge()\n", "\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:\n", " for sample_size in [50, 100, 200, 300, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set])\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': p_replication\n", " })\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']
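
Every analysis cell initializes an aggregate `df` ("We aggregate all results here"), but the line that actually appends each `tmp_data_frame` to it lies in the portion of the source that is cut off in this export. Below is a hedged sketch of that aggregation and of a per-condition summary consistent with the quantities printed in the loops; the `summarize` helper is an illustration, not the notebook's own code.

```python
import pandas as pd

# Assumed aggregation step at the end of each innermost loop (the original line is cut off):
# df = pd.concat([df, tmp_data_frame], ignore_index=True)


def summarize(results: pd.DataFrame, alpha: float = 0.05) -> pd.DataFrame:
    """Mean correlations and replication rate per connectivity/model/target/sample size."""
    tmp = results.assign(
        discovered=results['p_discovery_cv'] < alpha,
        replicated=(results['p_discovery_cv'] < alpha) & (results['p_replication'] < alpha),
    )
    grouped = tmp.groupby(['connectivity', 'model', 'target', 'n'])
    summary = grouped[['r_discovery_cv', 'r_replication']].mean()
    # Replicability: among discovery-significant iterations, the share that also replicates.
    summary['replicability'] = grouped['replicated'].sum() / grouped['discovered'].sum()
    return summary


# Example: summarize(df).round(3)
```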