{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Analyzing replicability of connectivity-based multivariate BWAS on the Human Connectome Project dataset\n", "\n", "## Imports" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "ExecuteTime": { "end_time": "2021-08-03T20:04:15.431840Z", "start_time": "2021-08-03T20:04:14.753565Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sklearn.linear_model import Ridge\n", "from sklearn.svm import SVR\n", "from sklearn.model_selection import KFold, train_test_split, cross_val_predict, GridSearchCV\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.decomposition import PCA\n", "from joblib import Parallel, delayed\n", "from mlxtend.evaluate import permutation_test\n", "sns.set(rc={\"figure.figsize\":(4, 2)})\n", "sns.set_style(\"whitegrid\")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Load HCP data\n", "\n", "We load functional network matrices (netmats) from the HCP1200-release, as published on connectomeDB: https://db.humanconnectome.org/\n", "Due to licensing issues, data is not supplied with the repository, but can be downloaded from the ConnectomeDB.\n", "See [hcp_data/readme.md](hcp_data/readme.md) for more details." ] }, { "cell_type": "code", "execution_count": 81, "outputs": [ { "data": { "text/plain": " Release Acquisition Gender Age 3T_Full_MR_Compl T1_Count \\\nSubject \n100004 S900 Q06 M 22-25 False 0 \n100206 S900 Q11 M 26-30 True 1 \n100307 Q1 Q01 F 26-30 True 1 \n100408 Q3 Q03 M 31-35 True 1 \n100610 S900 Q08 M 26-30 True 2 \n... ... ... ... ... ... ... 
\n992774 Q2 Q02 M 31-35 True 2 \n993675 S900 Q09 F 26-30 True 2 \n994273 S500 Q06 M 26-30 True 1 \n995174 S1200 Q13 M 22-25 False 1 \n996782 S900 Q08 F 26-30 True 2 \n\n T2_Count 3T_RS-fMRI_Count 3T_RS-fMRI_PctCompl 3T_Full_Task_fMRI \\\nSubject \n100004 0 0 0.0 False \n100206 1 4 100.0 True \n100307 1 4 100.0 True \n100408 1 4 100.0 True \n100610 1 4 100.0 True \n... ... ... ... ... \n992774 2 4 100.0 True \n993675 2 4 100.0 True \n994273 1 4 100.0 True \n995174 1 2 0.0 True \n996782 2 4 100.0 True \n\n ... Odor_Unadj Odor_AgeAdj PainIntens_RawScore PainInterf_Tscore \\\nSubject ... \n100004 ... 101.12 86.45 2.0 45.9 \n100206 ... 108.79 97.19 1.0 49.7 \n100307 ... 101.12 86.45 0.0 38.6 \n100408 ... 108.79 98.04 2.0 52.6 \n100610 ... 122.25 110.45 0.0 38.6 \n... ... ... ... ... ... \n992774 ... 122.25 111.41 4.0 50.1 \n993675 ... 122.25 110.45 0.0 38.6 \n994273 ... 122.25 111.41 7.0 63.8 \n995174 ... 88.61 64.58 3.0 50.1 \n996782 ... 108.79 97.19 0.0 38.6 \n\n Taste_Unadj Taste_AgeAdj Mars_Log_Score Mars_Errs Mars_Final \\\nSubject \n100004 107.17 105.31 1.80 0.0 1.80 \n100206 72.63 72.03 1.84 0.0 1.84 \n100307 71.69 71.76 1.76 0.0 1.76 \n100408 114.01 113.59 1.76 2.0 1.68 \n100610 84.84 85.31 1.92 1.0 1.88 \n... ... ... ... ... ... \n992774 107.17 103.55 1.76 0.0 1.76 \n993675 84.07 84.25 1.80 1.0 1.76 \n994273 110.65 109.73 1.80 1.0 1.76 \n995174 117.16 117.40 1.80 0.0 1.80 \n996782 75.43 73.72 1.84 0.0 1.84 \n\n age \nSubject \n100004 23.5 \n100206 28.0 \n100307 28.0 \n100408 33.0 \n100610 28.0 \n... ... \n992774 33.0 \n993675 28.0 \n994273 28.0 \n995174 23.5 \n996782 28.0 \n\n[1206 rows x 582 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ReleaseAcquisitionGenderAge3T_Full_MR_ComplT1_CountT2_Count3T_RS-fMRI_Count3T_RS-fMRI_PctCompl3T_Full_Task_fMRI...Odor_UnadjOdor_AgeAdjPainIntens_RawScorePainInterf_TscoreTaste_UnadjTaste_AgeAdjMars_Log_ScoreMars_ErrsMars_Finalage
Subject
100004S900Q06M22-25False0000.0False...101.1286.452.045.9107.17105.311.800.01.8023.5
100206S900Q11M26-30True114100.0True...108.7997.191.049.772.6372.031.840.01.8428.0
100307Q1Q01F26-30True114100.0True...101.1286.450.038.671.6971.761.760.01.7628.0
100408Q3Q03M31-35True114100.0True...108.7998.042.052.6114.01113.591.762.01.6833.0
100610S900Q08M26-30True214100.0True...122.25110.450.038.684.8485.311.921.01.8828.0
..................................................................
992774Q2Q02M31-35True224100.0True...122.25111.414.050.1107.17103.551.760.01.7633.0
993675S900Q09F26-30True224100.0True...122.25110.450.038.684.0784.251.801.01.7628.0
994273S500Q06M26-30True114100.0True...122.25111.417.063.8110.65109.731.801.01.7628.0
995174S1200Q13M22-25False1120.0True...88.6164.583.050.1117.16117.401.800.01.8023.5
996782S900Q08F26-30True224100.0True...108.7997.190.038.675.4373.721.840.01.8428.0
\n

1206 rows × 582 columns

\n
" }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# HCP data can be obtained from the connectomeDB\n", "# data is not part of this repository\n", "subjectIDs = pd.read_csv('hcp_data/subjectIDs.txt', header=None)\n", "\n", "netmats_pearson = pd.read_csv('hcp_data/netmats1_correlationZ.txt',\n", " sep=' ',\n", " header=None)\n", "netmats_pearson['ID'] = subjectIDs[0]\n", "netmats_pearson.set_index('ID', drop=True, inplace=True)\n", "\n", "\n", "netmats_parcor = pd.read_csv('hcp_data/netmats2_partial-correlation.txt',\n", " sep=' ',\n", " header=None)\n", "netmats_parcor['ID'] = subjectIDs[0]\n", "netmats_parcor.set_index('ID', drop=True, inplace=True)\n", "\n", "behavior = pd.read_csv('hcp_data/hcp1200_behavioral_data.csv')\n", "behavior = behavior.set_index('Subject', drop=True)\n", "\n", "# convert age to numeric\n", "age = []\n", "for s in behavior['Age']:\n", " if s == '36+':\n", " age.append(36)\n", " else:\n", " split = s.split(sep='-')\n", " age.append(np.mean((float(split[0]), float(split[1]))))\n", "\n", "behavior['age'] = age\n", "behavior" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "markdown", "source": [ "# Function to prepare target variable\n" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 98, "outputs": [], "source": [ "def create_data(target='CogTotalComp_AgeAdj', feature_data=netmats_parcor):\n", " # it's a good practice to use pandas for merging, messing up subject order can be painful\n", " features = feature_data.columns\n", " df = behavior\n", " df = df.merge(feature_data, left_index=True, right_index=True, how='left')\n", " if target:\n", " df = df.dropna(subset = [target] + features.values.tolist())\n", " y = df[target].values\n", " else:\n", " df = df.dropna(subset = features.values.tolist())\n", " rng = np.random.default_rng(42)\n", " y = rng.normal(0,1, len(df))\n", " X = df[features].values\n", " return X, y" ], 
"metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "markdown", "source": [ "# Function implementing a single bootstrap iteration\n", "\n", "We define a workhorse function which:\n", "- randomly samples the discovery and the replication datasets,\n", "- creates cross-validated estimates of predictive performance within the discovery sample\n", "- finalizes the model by fitting it to the whole discovery sample (overfits the discovery but not the replication sample)\n", "- use it to predict the replication sample" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 83, "outputs": [], "source": [ "def bootstrap_workhorse(X, y, sample_size, model, random_state):\n", "\n", " #create discovery and replication samples by random sampling from the whole dataset (without replacement)\n", " X_discovery, X_replication, y_discovery, y_replication = train_test_split(X, y, train_size=sample_size, test_size=sample_size, shuffle=True, random_state=random_state)\n", "\n", " # obtain cross-validated predictions in the discovery sample\n", " predicted_discovery_cv = cross_val_predict(estimator=model, X=X_discovery, y=y_discovery, cv=10, n_jobs=1)\n", " # correlation between the cross-validated predictions and observations in the discovery sample\n", " # this is the correct, unbiased estimate!\n", " r_disc_cv = np.corrcoef(predicted_discovery_cv, y_discovery)[0, 1]\n", " # finalize model by training it on the full discovery sample (without cross-validation)\n", " final_model = model.fit(X=X_discovery, y=y_discovery)\n", " # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.\n", " # we do this only to demonstrate biased estimates\n", " predicted_discovery_overfit = final_model.predict(X=X_discovery)\n", " # here we obtain the biased effect size (r) estimates for demonstrational purposes\n", " r_disc_overfit = np.corrcoef(predicted_discovery_overfit, 
y_discovery)[0, 1]\n", "\n", " # We use the final model to predict the replication sample\n", " # This is correct (no overfitting here), the final model did not see this data during training\n", " predicted_replication = final_model.predict(X=X_replication)\n", " # we obtain the out-of-sample prediction performance estimates\n", " r_rep = np.corrcoef(predicted_replication, y_replication)[0, 1]\n", "\n", " # below we calculate permutation-based p-values for all three effect size estimates (in-sample unbiased, in-sample biased, out-of-sample)\n", " # (one sided tests, testing for positive correlation)\n", " p_disc_cv = permutation_test(predicted_discovery_cv, y_discovery, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)\n", " p_disc_overfit = permutation_test(predicted_discovery_overfit, y_discovery, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)\n", " p_rep = permutation_test(predicted_replication, y_replication, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)\n", " # return results\n", " return r_disc_cv, r_disc_overfit, r_rep, p_disc_cv, p_disc_overfit, p_rep" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "markdown", "source": [ "All set, now we start the analysis." 
], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "# Replicability with sample sizes n=50, 100, 200, 300 and max\n", "Here we train a few different models on 100 bootstrap samples.\n", "\n", "We aggregate the results of our workhorse function in `n_bootstrap`=100 bootstrap cases (run in parallel).\n", "\n", "The whole process is repeated for all sample sizes, feature sets and target variables.\n", "\n", "## Here we test age and 5 cognitive variables, including 'cognitive ability' (the main target variable in the target paper)\n", "- age: age group of the participants\n", "- CogTotalComp_AgeAdj: total cognitive ability\n", "- PMAT24_A_CR: Fluid Intelligence (Penn Progressive Matrices)\n", "- CardSort_AgeAdj: Executive Function/Cognitive Flexibility (Dimensional Change Card Sort)\n", "- Flanker_AgeAdj: Executive Function/Inhibition (Flanker Task)\n", "- PicSeq_AgeAdj: Episodic Memory (Picture Sequence Memory)" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "# Reproducing the PCA+SVR-based model from the target paper\n", "### Like in the target paper:\n", "- Both PCA and SVR are done inside the cross-validation\n", "- PCA retains the first k principal components that together explain 50% of the variance\n", "- scikit-learn makes sure that PCA is only fit for the training samples\n", "- both for the test sets (in the cross-validation) and the replication sample PCA is not re-fit, but features are simply transformed with the already fit PCA" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 87, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "*****************************************************************\n", "netmats_parcor PCA_SVR age 50\n", "0.01897346554269831 0.18901378266057708\n", "Replicability at alpha = 0.05 : 57.14285714285714 %\n", "Replicability at alpha = 0.01 : 14.285714285714285 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", 
"Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR age 100\n", "0.18664851524294415 0.27347377929445293\n", "Replicability at alpha = 0.05 : 89.55223880597015 %\n", "Replicability at alpha = 0.01 : 40.298507462686565 %\n", "Replicability at alpha = 0.005 : 31.343283582089555 %\n", "Replicability at alpha = 0.001 : 11.940298507462686 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR age 200\n", "0.3121257047522771 0.36175079133098437\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 98.9795918367347 %\n", "Replicability at alpha = 0.005 : 95.91836734693877 %\n", "Replicability at alpha = 0.001 : 80.61224489795919 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR age 300\n", "0.36894676842215623 0.39438101148062005\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 100.0 %\n", "Replicability at alpha = 0.005 : 100.0 %\n", "Replicability at alpha = 0.001 : 100.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR age max\n", "0.4169914586721147 0.44371738829107926\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 100.0 %\n", "Replicability at alpha = 0.005 : 100.0 %\n", "Replicability at alpha = 0.001 : 100.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR CogTotalComp_AgeAdj 50\n", "-0.25822001000043204 0.14032170544200848\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR CogTotalComp_AgeAdj 100\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ ":60: 
RuntimeWarning: invalid value encountered in long_scalars\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "-0.039553381304751625 0.21262859874092135\n", "Replicability at alpha = 0.05 : 90.9090909090909 %\n", "Replicability at alpha = 0.01 : 45.45454545454545 %\n", "Replicability at alpha = 0.005 : 18.181818181818183 %\n", "Replicability at alpha = 0.001 : 9.090909090909092 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR CogTotalComp_AgeAdj 200\n", "0.14966202410710186 0.2713589974517626\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 61.29032258064516 %\n", "Replicability at alpha = 0.005 : 53.2258064516129 %\n", "Replicability at alpha = 0.001 : 27.419354838709676 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR CogTotalComp_AgeAdj 300\n", "0.23474749556135074 0.3101363215395622\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 94.73684210526315 %\n", "Replicability at alpha = 0.005 : 94.73684210526315 %\n", "Replicability at alpha = 0.001 : 76.84210526315789 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR CogTotalComp_AgeAdj max\n", "0.30682365741954354 0.34611199458096925\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 100.0 %\n", "Replicability at alpha = 0.005 : 100.0 %\n", "Replicability at alpha = 0.001 : 100.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR PMAT24_A_CR 50\n", "-0.10672071098435865 0.12106221070795271\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR PMAT24_A_CR 100\n", "0.07540309996263198 
0.16660957189128314\n", "Replicability at alpha = 0.05 : 58.06451612903226 %\n", "Replicability at alpha = 0.01 : 19.35483870967742 %\n", "Replicability at alpha = 0.005 : 3.225806451612903 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR PMAT24_A_CR 200\n", "0.1818194483607081 0.24330054856845362\n", "Replicability at alpha = 0.05 : 94.9367088607595 %\n", "Replicability at alpha = 0.01 : 62.0253164556962 %\n", "Replicability at alpha = 0.005 : 45.56962025316456 %\n", "Replicability at alpha = 0.001 : 20.253164556962027 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR PMAT24_A_CR 300\n", "0.23928719811339255 0.2836756329717983\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 96.93877551020408 %\n", "Replicability at alpha = 0.005 : 90.81632653061224 %\n", "Replicability at alpha = 0.001 : 75.51020408163265 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR PMAT24_A_CR max\n", "0.30016731017540577 0.3252736273978649\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 100.0 %\n", "Replicability at alpha = 0.005 : 100.0 %\n", "Replicability at alpha = 0.001 : 100.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR Flanker_AgeAdj 50\n", "-0.253682468352549 0.028663123664680764\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR Flanker_AgeAdj 100\n", "-0.12086835280130456 0.04446606264674254\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha 
= 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR Flanker_AgeAdj 200\n", "-0.04070672898990409 0.05833533752802022\n", "Replicability at alpha = 0.05 : 33.33333333333333 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR Flanker_AgeAdj 300\n", "0.010939993075878267 0.07671340073350896\n", "Replicability at alpha = 0.05 : 26.666666666666668 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR Flanker_AgeAdj max\n", "0.07368415050153652 0.12250618953881152\n", "Replicability at alpha = 0.05 : 91.83673469387756 %\n", "Replicability at alpha = 0.01 : 26.53061224489796 %\n", "Replicability at alpha = 0.005 : 12.244897959183673 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR CardSort_AgeAdj 50\n", "-0.2347808005536518 0.06074334420456127\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR CardSort_AgeAdj 100\n", "-0.07762735321443918 0.08296991306991304\n", "Replicability at alpha = 0.05 : 33.33333333333333 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR CardSort_AgeAdj 200\n", "0.021921082391207124 0.10256141335871115\n", "Replicability at alpha = 0.05 : 
35.294117647058826 %\n", "Replicability at alpha = 0.01 : 5.88235294117647 %\n", "Replicability at alpha = 0.005 : 5.88235294117647 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR CardSort_AgeAdj 300\n", "0.06747049534058164 0.12397123226254751\n", "Replicability at alpha = 0.05 : 55.26315789473685 %\n", "Replicability at alpha = 0.01 : 15.789473684210526 %\n", "Replicability at alpha = 0.005 : 5.263157894736842 %\n", "Replicability at alpha = 0.001 : 5.263157894736842 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR CardSort_AgeAdj max\n", "0.10973316806153383 0.1495503211206215\n", "Replicability at alpha = 0.05 : 96.1038961038961 %\n", "Replicability at alpha = 0.01 : 61.038961038961034 %\n", "Replicability at alpha = 0.005 : 37.66233766233766 %\n", "Replicability at alpha = 0.001 : 7.792207792207792 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR PicSeq_AgeAdj 50\n", "-0.2691222555349478 0.0751641945794097\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR PicSeq_AgeAdj 100\n", "-0.12847580806431988 0.09875903711275212\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR PicSeq_AgeAdj 200\n", "-0.007143177951446793 0.12405504834061054\n", "Replicability at alpha = 0.05 : 53.333333333333336 %\n", "Replicability at alpha = 0.01 : 13.333333333333334 %\n", "Replicability at alpha = 0.005 : 6.666666666666667 %\n", "Replicability at 
alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR PicSeq_AgeAdj 300\n", "0.06577036071435835 0.1525445910946501\n", "Replicability at alpha = 0.05 : 76.92307692307693 %\n", "Replicability at alpha = 0.01 : 25.64102564102564 %\n", "Replicability at alpha = 0.005 : 10.256410256410255 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor PCA_SVR PicSeq_AgeAdj max\n", "0.13437963540657485 0.1853026050732639\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 80.0 %\n", "Replicability at alpha = 0.005 : 74.44444444444444 %\n", "Replicability at alpha = 0.001 : 40.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR age 50\n", "-0.12643179950265376 0.05507531631070878\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR age 100\n", "0.009725514224550343 0.07992169563215587\n", "Replicability at alpha = 0.05 : 10.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR age 200\n", "0.0758358453025459 0.1398647058199516\n", "Replicability at alpha = 0.05 : 64.70588235294117 %\n", "Replicability at alpha = 0.01 : 26.47058823529412 %\n", "Replicability at alpha = 0.005 : 11.76470588235294 %\n", "Replicability at alpha = 0.001 : 2.941176470588235 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR age 300\n", "0.13396259958675433 0.15810854528637436\n", "Replicability at alpha = 0.05 : 89.1891891891892 
%\n", "Replicability at alpha = 0.01 : 40.54054054054054 %\n", "Replicability at alpha = 0.005 : 31.08108108108108 %\n", "Replicability at alpha = 0.001 : 12.162162162162163 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR age max\n", "0.16461627834119258 0.19084783442850575\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 85.56701030927834 %\n", "Replicability at alpha = 0.005 : 82.4742268041237 %\n", "Replicability at alpha = 0.001 : 56.70103092783505 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR CogTotalComp_AgeAdj 50\n", "-0.27360113131820085 0.07257458992180708\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR CogTotalComp_AgeAdj 100\n", "-0.11427316875393931 0.06349813361022945\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR CogTotalComp_AgeAdj 200\n", "-0.0012383369231139204 0.1061294190223969\n", "Replicability at alpha = 0.05 : 25.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR CogTotalComp_AgeAdj 300\n", "0.06987992460411578 0.13372831832326426\n", "Replicability at alpha = 0.05 : 62.16216216216216 %\n", "Replicability at alpha = 0.01 : 18.91891891891892 %\n", "Replicability at alpha = 0.005 : 10.81081081081081 %\n", "Replicability at alpha = 0.001 : 2.7027027027027026 %\n", 
"*****************************************************************\n", "netmats_pearson PCA_SVR CogTotalComp_AgeAdj max\n", "0.1329320326293221 0.1759580210438174\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 72.82608695652173 %\n", "Replicability at alpha = 0.005 : 58.69565217391305 %\n", "Replicability at alpha = 0.001 : 29.347826086956523 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR PMAT24_A_CR 50\n", "-0.13058241645252677 0.03937358302729429\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR PMAT24_A_CR 100\n", "-0.018219577369671586 0.05283547742503512\n", "Replicability at alpha = 0.05 : 18.75 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR PMAT24_A_CR 200\n", "0.04492628597688514 0.09991315410566606\n", "Replicability at alpha = 0.05 : 57.14285714285714 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR PMAT24_A_CR 300\n", "0.08929516196642572 0.14153398369488263\n", "Replicability at alpha = 0.05 : 78.72340425531915 %\n", "Replicability at alpha = 0.01 : 34.04255319148936 %\n", "Replicability at alpha = 0.005 : 27.659574468085108 %\n", "Replicability at alpha = 0.001 : 2.127659574468085 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR PMAT24_A_CR max\n", "0.15546061546042778 0.17931030150814908\n", "Replicability at alpha = 0.05 : 
100.0 %\n", "Replicability at alpha = 0.01 : 81.91489361702128 %\n", "Replicability at alpha = 0.005 : 76.59574468085107 %\n", "Replicability at alpha = 0.001 : 42.5531914893617 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR Flanker_AgeAdj 50\n", "-0.20847896699960677 0.026446681706768983\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR Flanker_AgeAdj 100\n", "-0.10129567054697304 0.037796860369966837\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR Flanker_AgeAdj 200\n", "-0.025107592465321495 0.04785239767856528\n", "Replicability at alpha = 0.05 : 16.666666666666664 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR Flanker_AgeAdj 300\n", "0.023206206455162977 0.07393753936413099\n", "Replicability at alpha = 0.05 : 18.181818181818183 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR Flanker_AgeAdj max\n", "0.07027346759899196 0.10203836949356383\n", "Replicability at alpha = 0.05 : 61.224489795918366 %\n", "Replicability at alpha = 0.01 : 8.16326530612245 %\n", "Replicability at alpha = 0.005 : 4.081632653061225 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", 
"*****************************************************************\n", "netmats_pearson PCA_SVR CardSort_AgeAdj 50\n", "-0.18617695057292352 0.052864559935567525\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR CardSort_AgeAdj 100\n", "-0.05366816791298359 0.05511154998176417\n", "Replicability at alpha = 0.05 : 60.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR CardSort_AgeAdj 200\n", "0.01642082868254707 0.07325950038420452\n", "Replicability at alpha = 0.05 : 45.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR CardSort_AgeAdj 300\n", "0.042886628759160786 0.09085835147403382\n", "Replicability at alpha = 0.05 : 38.46153846153847 %\n", "Replicability at alpha = 0.01 : 3.8461538461538463 %\n", "Replicability at alpha = 0.005 : 3.8461538461538463 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR CardSort_AgeAdj max\n", "0.08332392314286416 0.10638876826754434\n", "Replicability at alpha = 0.05 : 71.66666666666667 %\n", "Replicability at alpha = 0.01 : 10.0 %\n", "Replicability at alpha = 0.005 : 5.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR PicSeq_AgeAdj 50\n", "-0.23345400407893277 0.03493508595660751\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : 
nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR PicSeq_AgeAdj 100\n", "-0.10664785322193493 0.06147545482107202\n", "Replicability at alpha = 0.05 : 20.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR PicSeq_AgeAdj 200\n", "-0.02801783500947512 0.06749833856170852\n", "Replicability at alpha = 0.05 : 42.857142857142854 %\n", "Replicability at alpha = 0.01 : 14.285714285714285 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR PicSeq_AgeAdj 300\n", "0.013808903665158028 0.07458636299063129\n", "Replicability at alpha = 0.05 : 41.66666666666667 %\n", "Replicability at alpha = 0.01 : 8.333333333333332 %\n", "Replicability at alpha = 0.005 : 8.333333333333332 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson PCA_SVR PicSeq_AgeAdj max\n", "0.05753833941897148 0.10077547292317074\n", "Replicability at alpha = 0.05 : 56.81818181818182 %\n", "Replicability at alpha = 0.01 : 11.363636363636363 %\n", "Replicability at alpha = 0.005 : 2.272727272727273 %\n", "Replicability at alpha = 0.001 : 2.272727272727273 %\n", "CPU times: user 27.8 s, sys: 6.06 s, total: 33.9 s\n", "Wall time: 3h 7min 45s\n" ] }, { "data": { "text/plain": " connectivity model target n r_discovery_cv \\\n0 netmats_parcor PCA_SVR age 50 -0.353027 \n1 netmats_parcor PCA_SVR age 50 0.081894 \n2 netmats_parcor PCA_SVR age 50 -0.258248 \n3 netmats_parcor PCA_SVR age 50 0.050106 \n4 netmats_parcor PCA_SVR age 50 0.130115 \n... ... ... ... ... ... 
\n5995 netmats_pearson PCA_SVR PicSeq_AgeAdj 501 0.125791 \n5996 netmats_pearson PCA_SVR PicSeq_AgeAdj 501 0.060623 \n5997 netmats_pearson PCA_SVR PicSeq_AgeAdj 501 0.084630 \n5998 netmats_pearson PCA_SVR PicSeq_AgeAdj 501 0.071084 \n5999 netmats_pearson PCA_SVR PicSeq_AgeAdj 501 0.015929 \n\n r_discovery_overfit r_replication p_discovery_cv p_discovery_overfit \\\n0 0.774355 0.204078 0.997003 0.000999 \n1 0.913355 0.360970 0.275724 0.000999 \n2 0.841646 0.265158 0.962038 0.000999 \n3 0.899457 -0.206159 0.374625 0.000999 \n4 0.892774 0.282873 0.196803 0.000999 \n... ... ... ... ... \n5995 0.360809 0.062856 0.004995 0.000999 \n5996 0.352693 0.142834 0.085914 0.000999 \n5997 0.373386 0.163908 0.028971 0.000999 \n5998 0.376163 0.109268 0.065934 0.000999 \n5999 0.308118 0.032076 0.347652 0.000999 \n\n p_replication \n0 0.091908 \n1 0.001998 \n2 0.025974 \n3 0.923077 \n4 0.026973 \n... ... \n5995 0.085914 \n5996 0.000999 \n5997 0.000999 \n5998 0.012987 \n5999 0.225774 \n\n[6000 rows x 10 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
connectivitymodeltargetnr_discovery_cvr_discovery_overfitr_replicationp_discovery_cvp_discovery_overfitp_replication
0netmats_parcorPCA_SVRage50-0.3530270.7743550.2040780.9970030.0009990.091908
1netmats_parcorPCA_SVRage500.0818940.9133550.3609700.2757240.0009990.001998
2netmats_parcorPCA_SVRage50-0.2582480.8416460.2651580.9620380.0009990.025974
3netmats_parcorPCA_SVRage500.0501060.899457-0.2061590.3746250.0009990.923077
4netmats_parcorPCA_SVRage500.1301150.8927740.2828730.1968030.0009990.026973
.................................
5995netmats_pearsonPCA_SVRPicSeq_AgeAdj5010.1257910.3608090.0628560.0049950.0009990.085914
5996netmats_pearsonPCA_SVRPicSeq_AgeAdj5010.0606230.3526930.1428340.0859140.0009990.000999
5997netmats_pearsonPCA_SVRPicSeq_AgeAdj5010.0846300.3733860.1639080.0289710.0009990.000999
5998netmats_pearsonPCA_SVRPicSeq_AgeAdj5010.0710840.3761630.1092680.0659340.0009990.012987
5999netmats_pearsonPCA_SVRPicSeq_AgeAdj5010.0159290.3081180.0320760.3476520.0009990.225774
\n

6000 rows × 10 columns

\n
" }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor,\n", " 'netmats_pearson': netmats_pearson\n", "}\n", "\n", "models = {\n", " 'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),\n", " ('svr', SVR())])\n", "\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:\n", " for sample_size in [50, 100, 200, 300, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set])\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 
'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': p_replication\n", " })\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
connectivitymodeltargetnr_discovery_cvr_discovery_overfitr_replicationp_discovery_cvp_discovery_overfitp_replication
0netmats_parcorridgeage50-0.0485181.00.4046610.6293710.0009990.002997
1netmats_parcorridgeage500.1112101.00.2810410.2107890.0009990.026973
2netmats_parcorridgeage500.1033421.00.1071810.2277720.0009990.216783
3netmats_parcorridgeage500.2551121.0-0.2039810.0479520.0009990.909091
4netmats_parcorridgeage500.4256141.00.3856650.0019980.0009990.004995
.................................
5995netmats_pearsonridgePicSeq_AgeAdj5010.2096221.00.2178410.0009990.0009990.000999
5996netmats_pearsonridgePicSeq_AgeAdj5010.2377701.00.2019800.0009990.0009990.000999
5997netmats_pearsonridgePicSeq_AgeAdj5010.2686391.00.2038470.0009990.0009990.000999
5998netmats_pearsonridgePicSeq_AgeAdj5010.1736441.00.1957140.0019980.0009990.000999
5999netmats_pearsonridgePicSeq_AgeAdj5010.1484981.00.2048810.0019980.0009990.000999
\n

6000 rows × 10 columns

\n" }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor,\n", " 'netmats_pearson': netmats_pearson\n", "}\n", "\n", "models = {\n", " 'ridge': Ridge()\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:\n", " for sample_size in [50, 100, 200, 300, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set])\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': 
p_replication\n", " })\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
connectivitymodeltargetnr_discovery_cvr_discovery_overfitr_replicationp_discovery_cvp_discovery_overfitp_replication
0netmats_parcorPCA_SVRNone500.0191180.8937370.0629280.4455540.0009990.330669
1netmats_parcorPCA_SVRNone50-0.3195470.8977560.1342770.9950050.0009990.185814
2netmats_parcorPCA_SVRNone50-0.1206320.8719830.0521400.8161840.0009990.344655
3netmats_parcorPCA_SVRNone50-0.2241690.814265-0.0091140.9380620.0009990.536464
4netmats_parcorPCA_SVRNone50-0.1757180.8893700.0838030.8761240.0009990.279720
.................................
495netmats_parcorPCA_SVRNone501-0.0717400.8836890.0262350.9500500.0009990.282717
496netmats_parcorPCA_SVRNone5010.0099390.8775260.0199560.4145850.0009990.338661
497netmats_parcorPCA_SVRNone501-0.0099450.8752160.0150690.6083920.0009990.349650
498netmats_parcorPCA_SVRNone501-0.0207320.876886-0.0822110.6863140.0009990.969031
499netmats_parcorPCA_SVRNone501-0.0589610.8811250.0143050.9140860.0009990.374625
\n

500 rows × 10 columns

\n" }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor\n", "}\n", "\n", "models = {\n", " 'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),\n", " ('svr', SVR())])\n", "\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in [None]:\n", " for sample_size in [50, 100, 200, 300, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set]) # gives a random y when target is None\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': p_replication\n", " })\n", " 
#sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']:58: RuntimeWarning: invalid value encountered in long_scalars\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "-0.14562995074992818 0.24088134739904876\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor SVR CogTotalComp_AgeAdj 200\n", "0.03603161832397271 0.2851671994147706\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 43.47826086956522 %\n", "Replicability at alpha = 0.005 : 34.78260869565217 %\n", "Replicability at alpha = 0.001 : 8.695652173913043 %\n", "*****************************************************************\n", "netmats_parcor SVR CogTotalComp_AgeAdj 300\n", "0.1468259280416723 0.309336370533758\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 71.23287671232876 %\n", "Replicability at alpha = 0.005 : 67.12328767123287 %\n", "Replicability at alpha = 0.001 : 49.31506849315068 %\n", "*****************************************************************\n", "netmats_parcor SVR CogTotalComp_AgeAdj max\n", "0.2549413117184034 0.33152667215407533\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 100.0 %\n", "Replicability at alpha = 0.005 : 100.0 %\n", "Replicability at alpha = 0.001 : 100.0 %\n", "*****************************************************************\n", "netmats_parcor SVR PMAT24_A_CR 50\n", 
"-0.22239623119009483 0.15643323642010132\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor SVR PMAT24_A_CR 100\n", "-0.034263490784347574 0.1757279082959372\n", "Replicability at alpha = 0.05 : 22.22222222222222 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor SVR PMAT24_A_CR 200\n", "0.1264519179930518 0.24892720515585945\n", "Replicability at alpha = 0.05 : 92.15686274509804 %\n", "Replicability at alpha = 0.01 : 54.90196078431373 %\n", "Replicability at alpha = 0.005 : 35.294117647058826 %\n", "Replicability at alpha = 0.001 : 13.725490196078432 %\n", "*****************************************************************\n", "netmats_parcor SVR PMAT24_A_CR 300\n", "0.19804464630414487 0.2842987213429107\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 86.81318681318682 %\n", "Replicability at alpha = 0.005 : 82.41758241758241 %\n", "Replicability at alpha = 0.001 : 57.14285714285714 %\n", "*****************************************************************\n", "netmats_parcor SVR PMAT24_A_CR max\n", "0.281877972297132 0.32318498565606746\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 100.0 %\n", "Replicability at alpha = 0.005 : 100.0 %\n", "Replicability at alpha = 0.001 : 100.0 %\n", "*****************************************************************\n", "netmats_parcor SVR Flanker_AgeAdj 50\n", "-0.3204641069539859 0.030480831068828495\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", 
"*****************************************************************\n", "netmats_parcor SVR Flanker_AgeAdj 100\n", "-0.1890903926253441 0.06595568727430982\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor SVR Flanker_AgeAdj 200\n", "-0.08732742503822419 0.08078012627412137\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor SVR Flanker_AgeAdj 300\n", "-0.02181275209991224 0.09337973998758166\n", "Replicability at alpha = 0.05 : 66.66666666666666 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor SVR Flanker_AgeAdj max\n", "0.037791420905649346 0.13254327723076448\n", "Replicability at alpha = 0.05 : 88.88888888888889 %\n", "Replicability at alpha = 0.01 : 18.51851851851852 %\n", "Replicability at alpha = 0.005 : 7.4074074074074066 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor SVR CardSort_AgeAdj 50\n", "-0.31648999016482265 0.08034666460705671\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_parcor SVR CardSort_AgeAdj 100\n", "-0.1724700765060818 0.10139569898385202\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 
0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_parcor SVR CardSort_AgeAdj 200\n", "-0.03266879795717834 0.12273062020169488\n", "Replicability at alpha = 0.05 : 33.33333333333333 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor SVR CardSort_AgeAdj 300\n", "0.039912731726428986 0.14838044253474514\n", "Replicability at alpha = 0.05 : 64.28571428571429 %\n", "Replicability at alpha = 0.01 : 14.285714285714285 %\n", "Replicability at alpha = 0.005 : 10.714285714285714 %\n", "Replicability at alpha = 0.001 : 3.571428571428571 %\n", "*****************************************************************\n", "netmats_parcor SVR CardSort_AgeAdj max\n", "0.10304816152320781 0.16557288741091614\n", "Replicability at alpha = 0.05 : 98.59154929577466 %\n", "Replicability at alpha = 0.01 : 59.154929577464785 %\n", "Replicability at alpha = 0.005 : 40.845070422535215 %\n", "Replicability at alpha = 0.001 : 2.8169014084507045 %\n", "*****************************************************************\n", "netmats_parcor SVR PicSeq_AgeAdj 50\n", "-0.3208113579952884 0.08193465752885971\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_parcor SVR PicSeq_AgeAdj 100\n", "-0.19464008504119243 0.11223560776725991\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_parcor SVR PicSeq_AgeAdj 200\n", "-0.08624314038157287 
0.142170250432876\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 100.0 %\n", "Replicability at alpha = 0.005 : 50.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor SVR PicSeq_AgeAdj 300\n", "0.0027082380419047036 0.16087552055866783\n", "Replicability at alpha = 0.05 : 83.33333333333334 %\n", "Replicability at alpha = 0.01 : 25.0 %\n", "Replicability at alpha = 0.005 : 8.333333333333332 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_parcor SVR PicSeq_AgeAdj max\n", "0.0888093961587751 0.186970438550117\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 61.53846153846154 %\n", "Replicability at alpha = 0.005 : 40.0 %\n", "Replicability at alpha = 0.001 : 7.6923076923076925 %\n", "*****************************************************************\n", "netmats_pearson SVR age 50\n", "-0.07578059047514771 0.11307723629924883\n", "Replicability at alpha = 0.05 : 33.33333333333333 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR age 100\n", "0.10117091818899272 0.1919678284018117\n", "Replicability at alpha = 0.05 : 60.60606060606061 %\n", "Replicability at alpha = 0.01 : 15.151515151515152 %\n", "Replicability at alpha = 0.005 : 12.121212121212121 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR age 200\n", "0.23100402523093122 0.30911095241403364\n", "Replicability at alpha = 0.05 : 98.86363636363636 %\n", "Replicability at alpha = 0.01 : 82.95454545454545 %\n", "Replicability at alpha = 0.005 : 70.45454545454545 %\n", "Replicability at alpha = 0.001 : 50.0 %\n", 
"*****************************************************************\n", "netmats_pearson SVR age 300\n", "0.31953483250166514 0.34442934241771356\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 100.0 %\n", "Replicability at alpha = 0.005 : 100.0 %\n", "Replicability at alpha = 0.001 : 97.0 %\n", "*****************************************************************\n", "netmats_pearson SVR age max\n", "0.4010174468125596 0.42731662316926716\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 100.0 %\n", "Replicability at alpha = 0.005 : 100.0 %\n", "Replicability at alpha = 0.001 : 100.0 %\n", "*****************************************************************\n", "netmats_pearson SVR CogTotalComp_AgeAdj 50\n", "-0.30982472602999245 0.11287333731403326\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_pearson SVR CogTotalComp_AgeAdj 100\n", "-0.13890778666026496 0.12805067469075515\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_pearson SVR CogTotalComp_AgeAdj 200\n", "-7.21805212150481e-05 0.1730921499219021\n", "Replicability at alpha = 0.05 : 61.53846153846154 %\n", "Replicability at alpha = 0.01 : 15.384615384615385 %\n", "Replicability at alpha = 0.005 : 7.6923076923076925 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR CogTotalComp_AgeAdj 300\n", "0.09110523124823479 0.19749525772200202\n", "Replicability at alpha = 0.05 : 88.23529411764706 %\n", "Replicability at alpha = 0.01 : 43.13725490196079 
%\n", "Replicability at alpha = 0.005 : 25.49019607843137 %\n", "Replicability at alpha = 0.001 : 5.88235294117647 %\n", "*****************************************************************\n", "netmats_pearson SVR CogTotalComp_AgeAdj max\n", "0.17509369508119277 0.23880005412473712\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 96.96969696969697 %\n", "Replicability at alpha = 0.005 : 91.91919191919192 %\n", "Replicability at alpha = 0.001 : 71.71717171717171 %\n", "*****************************************************************\n", "netmats_pearson SVR PMAT24_A_CR 50\n", "-0.18522734626282625 0.0659513189654039\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR PMAT24_A_CR 100\n", "-0.03541889369372119 0.10663680239430814\n", "Replicability at alpha = 0.05 : 22.22222222222222 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR PMAT24_A_CR 200\n", "0.07480305222747267 0.16771321619124785\n", "Replicability at alpha = 0.05 : 76.47058823529412 %\n", "Replicability at alpha = 0.01 : 23.52941176470588 %\n", "Replicability at alpha = 0.005 : 8.823529411764707 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR PMAT24_A_CR 300\n", "0.14465996000988599 0.2164368604723245\n", "Replicability at alpha = 0.05 : 94.66666666666667 %\n", "Replicability at alpha = 0.01 : 65.33333333333333 %\n", "Replicability at alpha = 0.005 : 50.66666666666667 %\n", "Replicability at alpha = 0.001 : 13.333333333333334 %\n", 
"*****************************************************************\n", "netmats_pearson SVR PMAT24_A_CR max\n", "0.23067026527717135 0.26365788015471814\n", "Replicability at alpha = 0.05 : 100.0 %\n", "Replicability at alpha = 0.01 : 100.0 %\n", "Replicability at alpha = 0.005 : 99.0 %\n", "Replicability at alpha = 0.001 : 93.0 %\n", "*****************************************************************\n", "netmats_pearson SVR Flanker_AgeAdj 50\n", "-0.27036407987316635 0.03746547055481584\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_pearson SVR Flanker_AgeAdj 100\n", "-0.14100630065170205 0.05657798711077884\n", "Replicability at alpha = 0.05 : 66.66666666666666 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR Flanker_AgeAdj 200\n", "-0.055116793515234415 0.05799142750256638\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR Flanker_AgeAdj 300\n", "0.004240141987394162 0.08004593409063611\n", "Replicability at alpha = 0.05 : 23.076923076923077 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR Flanker_AgeAdj max\n", "0.05566085950364401 0.11481030338787498\n", "Replicability at alpha = 0.05 : 75.75757575757575 %\n", "Replicability at alpha = 0.01 : 15.151515151515152 %\n", "Replicability at 
alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR CardSort_AgeAdj 50\n", "-0.2643667597050225 0.07491052110663295\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_pearson SVR CardSort_AgeAdj 100\n", "-0.10778687253303708 0.08392608342465463\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_pearson SVR CardSort_AgeAdj 200\n", "0.00894686954658656 0.10845619502551797\n", "Replicability at alpha = 0.05 : 35.294117647058826 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR CardSort_AgeAdj 300\n", "0.058572453315874566 0.1317703098552681\n", "Replicability at alpha = 0.05 : 59.375 %\n", "Replicability at alpha = 0.01 : 12.5 %\n", "Replicability at alpha = 0.005 : 6.25 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR CardSort_AgeAdj max\n", "0.11015433267063868 0.1472505836113826\n", "Replicability at alpha = 0.05 : 96.0 %\n", "Replicability at alpha = 0.01 : 52.0 %\n", "Replicability at alpha = 0.005 : 30.666666666666664 %\n", "Replicability at alpha = 0.001 : 4.0 %\n", "*****************************************************************\n", "netmats_pearson SVR PicSeq_AgeAdj 50\n", "-0.2938000509765391 0.061028243473976014\n", "Replicability at alpha = 0.05 : nan %\n", "Replicability 
at alpha = 0.01 : nan %\n", "Replicability at alpha = 0.005 : nan %\n", "Replicability at alpha = 0.001 : nan %\n", "*****************************************************************\n", "netmats_pearson SVR PicSeq_AgeAdj 100\n", "-0.1642823732841255 0.08346539469408594\n", "Replicability at alpha = 0.05 : 0.0 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR PicSeq_AgeAdj 200\n", "-0.05458187746701462 0.10608631158346489\n", "Replicability at alpha = 0.05 : 66.66666666666666 %\n", "Replicability at alpha = 0.01 : 0.0 %\n", "Replicability at alpha = 0.005 : 0.0 %\n", "Replicability at alpha = 0.001 : 0.0 %\n", "*****************************************************************\n", "netmats_pearson SVR PicSeq_AgeAdj 300\n", "0.021856737276436368 0.12677891257700286\n", "Replicability at alpha = 0.05 : 75.0 %\n", "Replicability at alpha = 0.01 : 20.0 %\n", "Replicability at alpha = 0.005 : 10.0 %\n", "Replicability at alpha = 0.001 : 5.0 %\n", "*****************************************************************\n", "netmats_pearson SVR PicSeq_AgeAdj max\n", "0.09644924771734871 0.1680084950838603\n", "Replicability at alpha = 0.05 : 95.38461538461539 %\n", "Replicability at alpha = 0.01 : 47.69230769230769 %\n", "Replicability at alpha = 0.005 : 38.46153846153847 %\n", "Replicability at alpha = 0.001 : 15.384615384615385 %\n", "CPU times: user 24.1 s, sys: 6.05 s, total: 30.2 s\n", "Wall time: 3h 47min 20s\n" ] }, { "data": { "text/plain": " connectivity model target n r_discovery_cv \\\n0 netmats_parcor SVR age 50 -0.330564 \n1 netmats_parcor SVR age 50 -0.023555 \n2 netmats_parcor SVR age 50 -0.176967 \n3 netmats_parcor SVR age 50 0.093535 \n4 netmats_parcor SVR age 50 0.220397 \n... ... ... ... ... ... 
\n5995 netmats_pearson SVR PicSeq_AgeAdj 501 0.167868 \n5996 netmats_pearson SVR PicSeq_AgeAdj 501 0.134985 \n5997 netmats_pearson SVR PicSeq_AgeAdj 501 0.155819 \n5998 netmats_pearson SVR PicSeq_AgeAdj 501 0.065164 \n5999 netmats_pearson SVR PicSeq_AgeAdj 501 0.044413 \n\n r_discovery_overfit r_replication p_discovery_cv p_discovery_overfit \\\n0 0.899135 0.288430 0.994006 0.000999 \n1 0.963181 0.291646 0.591409 0.000999 \n2 0.968481 0.092822 0.884116 0.000999 \n3 0.960486 -0.161212 0.299700 0.000999 \n4 0.958377 0.310812 0.072927 0.000999 \n... ... ... ... ... \n5995 0.469294 0.119368 0.000999 0.000999 \n5996 0.493440 0.222072 0.001998 0.000999 \n5997 0.522091 0.223508 0.000999 0.000999 \n5998 0.484062 0.215828 0.081918 0.000999 \n5999 0.398393 0.100157 0.139860 0.000999 \n\n p_replication \n0 0.029970 \n1 0.019980 \n2 0.244755 \n3 0.854146 \n4 0.016983 \n... ... \n5995 0.004995 \n5996 0.000999 \n5997 0.000999 \n5998 0.000999 \n5999 0.018981 \n\n[6000 rows x 10 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
connectivitymodeltargetnr_discovery_cvr_discovery_overfitr_replicationp_discovery_cvp_discovery_overfitp_replication
0netmats_parcorSVRage50-0.3305640.8991350.2884300.9940060.0009990.029970
1netmats_parcorSVRage50-0.0235550.9631810.2916460.5914090.0009990.019980
2netmats_parcorSVRage50-0.1769670.9684810.0928220.8841160.0009990.244755
3netmats_parcorSVRage500.0935350.960486-0.1612120.2997000.0009990.854146
4netmats_parcorSVRage500.2203970.9583770.3108120.0729270.0009990.016983
.................................
5995netmats_pearsonSVRPicSeq_AgeAdj5010.1678680.4692940.1193680.0009990.0009990.004995
5996netmats_pearsonSVRPicSeq_AgeAdj5010.1349850.4934400.2220720.0019980.0009990.000999
5997netmats_pearsonSVRPicSeq_AgeAdj5010.1558190.5220910.2235080.0009990.0009990.000999
5998netmats_pearsonSVRPicSeq_AgeAdj5010.0651640.4840620.2158280.0819180.0009990.000999
5999netmats_pearsonSVRPicSeq_AgeAdj5010.0444130.3983930.1001570.1398600.0009990.018981
\n

6000 rows × 10 columns

\n
" }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor,\n", " 'netmats_pearson': netmats_pearson\n", "}\n", "\n", "models = {\n", " 'SVR': SVR()\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:\n", " for sample_size in [50, 100, 200, 300, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set])\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': 
p_replication\n", " })\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
connectivitymodeltargetnr_discovery_cvr_discovery_overfitr_replicationp_discovery_cvp_discovery_overfitp_replication
0netmats_parcorPCA_Ridgeage50-0.2606360.4579830.2259770.9690310.0019980.062937
1netmats_parcorPCA_Ridgeage500.1897520.6876110.2466910.0989010.0009990.049950
2netmats_parcorPCA_Ridgeage50-0.0143290.730443-0.0456660.5384620.0009990.601399
3netmats_parcorPCA_Ridgeage500.1945670.710963-0.1623780.1018980.0009990.861139
4netmats_parcorPCA_Ridgeage500.2234280.7176700.3045250.0759240.0009990.015984
.................................
5995netmats_pearsonPCA_RidgePicSeq_AgeAdj5010.1748580.2862060.0869550.0009990.0009990.029970
5996netmats_pearsonPCA_RidgePicSeq_AgeAdj5010.1141240.2617990.1463300.0069930.0009990.000999
5997netmats_pearsonPCA_RidgePicSeq_AgeAdj5010.1931510.3108000.1647870.0009990.0009990.000999
5998netmats_pearsonPCA_RidgePicSeq_AgeAdj5010.1441120.2851250.0959990.0029970.0009990.018981
5999netmats_pearsonPCA_RidgePicSeq_AgeAdj5010.0285260.2121400.0913880.2567430.0009990.021978
\n

6000 rows × 10 columns

\n" }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor,\n", " 'netmats_pearson': netmats_pearson\n", "}\n", "\n", "models = {\n", " 'PCA_Ridge': Pipeline([('pca', PCA(n_components=0.5)),\n", " ('ridge', Ridge())])\n", "\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:\n", " for sample_size in [50, 100, 200, 300, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set])\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 
'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': p_replication\n", " })\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
connectivitymodeltargetnr_discovery_cvr_discovery_overfitr_replicationp_discovery_cvp_discovery_overfitp_replication
0netmats_parcorKernelRidgeage50-0.2895301.00.0979630.9810190.0009990.253746
1netmats_parcorKernelRidgeage50-0.4184381.00.0201280.9990010.0009990.449550
2netmats_parcorKernelRidgeage50-0.1410161.0-0.1407650.8221780.0009990.838162
3netmats_parcorKernelRidgeage500.1316421.0-0.1026260.1838160.0009990.760240
4netmats_parcorKernelRidgeage500.0246831.00.2039180.4265730.0009990.079920
.................................
5995netmats_pearsonKernelRidgePicSeq_AgeAdj5010.2027201.00.2259790.0009990.0009990.000999
5996netmats_pearsonKernelRidgePicSeq_AgeAdj5010.2233861.00.2176580.0009990.0009990.000999
5997netmats_pearsonKernelRidgePicSeq_AgeAdj5010.2747741.00.2064990.0009990.0009990.000999
5998netmats_pearsonKernelRidgePicSeq_AgeAdj5010.1665911.00.2032360.0009990.0009990.000999
5999netmats_pearsonKernelRidgePicSeq_AgeAdj5010.1426751.00.2175540.0019980.0009990.000999
\n

6000 rows × 10 columns

\n" }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "from sklearn.kernel_ridge import KernelRidge\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor,\n", " 'netmats_pearson': netmats_pearson\n", "}\n", "\n", "models = {\n", " 'KernelRidge': KernelRidge()\n", "\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:\n", " for sample_size in [50, 100, 200, 300, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set])\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 
'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': p_replication\n", " })\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']