{ "cells": [ { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "# Analyzing replicability of connectivity-based multivariate BWAS on the Human Connectome Project dataset\n", "\n", "### Re-compute certain results for more sample sizes.\n", "\n", "## Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2021-08-03T20:04:15.431840Z", "start_time": "2021-08-03T20:04:14.753565Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sklearn.linear_model import Ridge\n", "from sklearn.svm import SVR\n", "from sklearn.model_selection import KFold, train_test_split, cross_val_predict, GridSearchCV\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.decomposition import PCA\n", "from joblib import Parallel, delayed\n", "from mlxtend.evaluate import permutation_test\n", "sns.set(rc={\"figure.figsize\":(4, 2)})\n", "sns.set_style(\"whitegrid\")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Load HCP data\n", "\n", "We load functional network matrices (netmats) from the HCP1200-release, as published on ConnectomeDB: https://db.humanconnectome.org/\n", "Due to licensing issues, data is not supplied with the repository, but can be downloaded from the ConnectomeDB.\n", "See [hcp_data/readme.md](hcp_data/readme.md) for more details." ] }, { "cell_type": "code", "execution_count": 2, "outputs": [ { "data": { "text/plain": " Release Acquisition Gender Age 3T_Full_MR_Compl T1_Count \\\nSubject \n100004 S900 Q06 M 22-25 False 0 \n100206 S900 Q11 M 26-30 True 1 \n100307 Q1 Q01 F 26-30 True 1 \n100408 Q3 Q03 M 31-35 True 1 \n100610 S900 Q08 M 26-30 True 2 \n... ... ... ... ... ... ... 
\n992774 Q2 Q02 M 31-35 True 2 \n993675 S900 Q09 F 26-30 True 2 \n994273 S500 Q06 M 26-30 True 1 \n995174 S1200 Q13 M 22-25 False 1 \n996782 S900 Q08 F 26-30 True 2 \n\n T2_Count 3T_RS-fMRI_Count 3T_RS-fMRI_PctCompl 3T_Full_Task_fMRI \\\nSubject \n100004 0 0 0.0 False \n100206 1 4 100.0 True \n100307 1 4 100.0 True \n100408 1 4 100.0 True \n100610 1 4 100.0 True \n... ... ... ... ... \n992774 2 4 100.0 True \n993675 2 4 100.0 True \n994273 1 4 100.0 True \n995174 1 2 0.0 True \n996782 2 4 100.0 True \n\n ... Odor_Unadj Odor_AgeAdj PainIntens_RawScore PainInterf_Tscore \\\nSubject ... \n100004 ... 101.12 86.45 2.0 45.9 \n100206 ... 108.79 97.19 1.0 49.7 \n100307 ... 101.12 86.45 0.0 38.6 \n100408 ... 108.79 98.04 2.0 52.6 \n100610 ... 122.25 110.45 0.0 38.6 \n... ... ... ... ... ... \n992774 ... 122.25 111.41 4.0 50.1 \n993675 ... 122.25 110.45 0.0 38.6 \n994273 ... 122.25 111.41 7.0 63.8 \n995174 ... 88.61 64.58 3.0 50.1 \n996782 ... 108.79 97.19 0.0 38.6 \n\n Taste_Unadj Taste_AgeAdj Mars_Log_Score Mars_Errs Mars_Final \\\nSubject \n100004 107.17 105.31 1.80 0.0 1.80 \n100206 72.63 72.03 1.84 0.0 1.84 \n100307 71.69 71.76 1.76 0.0 1.76 \n100408 114.01 113.59 1.76 2.0 1.68 \n100610 84.84 85.31 1.92 1.0 1.88 \n... ... ... ... ... ... \n992774 107.17 103.55 1.76 0.0 1.76 \n993675 84.07 84.25 1.80 1.0 1.76 \n994273 110.65 109.73 1.80 1.0 1.76 \n995174 117.16 117.40 1.80 0.0 1.80 \n996782 75.43 73.72 1.84 0.0 1.84 \n\n age \nSubject \n100004 23.5 \n100206 28.0 \n100307 28.0 \n100408 33.0 \n100610 28.0 \n... ... \n992774 33.0 \n993675 28.0 \n994273 28.0 \n995174 23.5 \n996782 28.0 \n\n[1206 rows x 582 columns]", "text/html": "
| \n | Release | \nAcquisition | \nGender | \nAge | \n3T_Full_MR_Compl | \nT1_Count | \nT2_Count | \n3T_RS-fMRI_Count | \n3T_RS-fMRI_PctCompl | \n3T_Full_Task_fMRI | \n... | \nOdor_Unadj | \nOdor_AgeAdj | \nPainIntens_RawScore | \nPainInterf_Tscore | \nTaste_Unadj | \nTaste_AgeAdj | \nMars_Log_Score | \nMars_Errs | \nMars_Final | \nage | \n
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Subject | \n\n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n |
| 100004 | \nS900 | \nQ06 | \nM | \n22-25 | \nFalse | \n0 | \n0 | \n0 | \n0.0 | \nFalse | \n... | \n101.12 | \n86.45 | \n2.0 | \n45.9 | \n107.17 | \n105.31 | \n1.80 | \n0.0 | \n1.80 | \n23.5 | \n
| 100206 | \nS900 | \nQ11 | \nM | \n26-30 | \nTrue | \n1 | \n1 | \n4 | \n100.0 | \nTrue | \n... | \n108.79 | \n97.19 | \n1.0 | \n49.7 | \n72.63 | \n72.03 | \n1.84 | \n0.0 | \n1.84 | \n28.0 | \n
| 100307 | \nQ1 | \nQ01 | \nF | \n26-30 | \nTrue | \n1 | \n1 | \n4 | \n100.0 | \nTrue | \n... | \n101.12 | \n86.45 | \n0.0 | \n38.6 | \n71.69 | \n71.76 | \n1.76 | \n0.0 | \n1.76 | \n28.0 | \n
| 100408 | \nQ3 | \nQ03 | \nM | \n31-35 | \nTrue | \n1 | \n1 | \n4 | \n100.0 | \nTrue | \n... | \n108.79 | \n98.04 | \n2.0 | \n52.6 | \n114.01 | \n113.59 | \n1.76 | \n2.0 | \n1.68 | \n33.0 | \n
| 100610 | \nS900 | \nQ08 | \nM | \n26-30 | \nTrue | \n2 | \n1 | \n4 | \n100.0 | \nTrue | \n... | \n122.25 | \n110.45 | \n0.0 | \n38.6 | \n84.84 | \n85.31 | \n1.92 | \n1.0 | \n1.88 | \n28.0 | \n
| ... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
| 992774 | \nQ2 | \nQ02 | \nM | \n31-35 | \nTrue | \n2 | \n2 | \n4 | \n100.0 | \nTrue | \n... | \n122.25 | \n111.41 | \n4.0 | \n50.1 | \n107.17 | \n103.55 | \n1.76 | \n0.0 | \n1.76 | \n33.0 | \n
| 993675 | \nS900 | \nQ09 | \nF | \n26-30 | \nTrue | \n2 | \n2 | \n4 | \n100.0 | \nTrue | \n... | \n122.25 | \n110.45 | \n0.0 | \n38.6 | \n84.07 | \n84.25 | \n1.80 | \n1.0 | \n1.76 | \n28.0 | \n
| 994273 | \nS500 | \nQ06 | \nM | \n26-30 | \nTrue | \n1 | \n1 | \n4 | \n100.0 | \nTrue | \n... | \n122.25 | \n111.41 | \n7.0 | \n63.8 | \n110.65 | \n109.73 | \n1.80 | \n1.0 | \n1.76 | \n28.0 | \n
| 995174 | \nS1200 | \nQ13 | \nM | \n22-25 | \nFalse | \n1 | \n1 | \n2 | \n0.0 | \nTrue | \n... | \n88.61 | \n64.58 | \n3.0 | \n50.1 | \n117.16 | \n117.40 | \n1.80 | \n0.0 | \n1.80 | \n23.5 | \n
| 996782 | \nS900 | \nQ08 | \nF | \n26-30 | \nTrue | \n2 | \n2 | \n4 | \n100.0 | \nTrue | \n... | \n108.79 | \n97.19 | \n0.0 | \n38.6 | \n75.43 | \n73.72 | \n1.84 | \n0.0 | \n1.84 | \n28.0 | \n
1206 rows × 582 columns
\n| \n | connectivity | \nmodel | \ntarget | \nn | \nr_discovery_cv | \nr_discovery_overfit | \nr_replication | \np_discovery_cv | \np_discovery_overfit | \np_replication | \n
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \nnetmats_pearson | \nPCA_SVR | \nage | \n25 | \nNaN | \n0.466156 | \n-0.139314 | \n0.995005 | \n0.017982 | \n0.757243 | \n
| 1 | \nnetmats_pearson | \nPCA_SVR | \nage | \n25 | \nNaN | \n0.689387 | \n0.196194 | \n0.94006 | \n0.000999 | \n0.180819 | \n
| 2 | \nnetmats_pearson | \nPCA_SVR | \nage | \n25 | \nNaN | \n0.681033 | \n0.064105 | \n0.944056 | \n0.000999 | \n0.313686 | \n
| 3 | \nnetmats_pearson | \nPCA_SVR | \nage | \n25 | \nNaN | \n0.513247 | \n0.003161 | \n0.998002 | \n0.008991 | \n0.526474 | \n
| 4 | \nnetmats_pearson | \nPCA_SVR | \nage | \n25 | \nNaN | \n0.870617 | \n0.063318 | \n1.0 | \n0.000999 | \n0.376623 | \n
| ... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
| 11995 | \nnetmats_pearson | \nPCA_SVR | \nPicSeq_AgeAdj | \n501 | \n0.170381 | \n0.360809 | \n0.062856 | \n0.004995 | \n0.000999 | \n0.085914 | \n
| 11996 | \nnetmats_pearson | \nPCA_SVR | \nPicSeq_AgeAdj | \n501 | \n0.095436 | \n0.352693 | \n0.142834 | \n0.085914 | \n0.000999 | \n0.000999 | \n
| 11997 | \nnetmats_pearson | \nPCA_SVR | \nPicSeq_AgeAdj | \n501 | \n0.098129 | \n0.373386 | \n0.163908 | \n0.028971 | \n0.000999 | \n0.000999 | \n
| 11998 | \nnetmats_pearson | \nPCA_SVR | \nPicSeq_AgeAdj | \n501 | \n0.12842 | \n0.376163 | \n0.109268 | \n0.065934 | \n0.000999 | \n0.012987 | \n
| 11999 | \nnetmats_pearson | \nPCA_SVR | \nPicSeq_AgeAdj | \n501 | \n0.07409 | \n0.308118 | \n0.032076 | \n0.347652 | \n0.000999 | \n0.225774 | \n
12000 rows × 10 columns
\n| \n | connectivity | \nmodel | \ntarget | \nn | \nr_discovery_cv | \nr_discovery_overfit | \nr_replication | \np_discovery_cv | \np_discovery_overfit | \np_replication | \n
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \nnetmats_parcor | \nridge | \nage | \n25 | \nNaN | \n1.0 | \n0.420725 | \n0.3996 | \n0.000999 | \n0.01998 | \n
| 1 | \nnetmats_parcor | \nridge | \nage | \n25 | \nNaN | \n1.0 | \n0.183304 | \n0.005994 | \n0.000999 | \n0.197802 | \n
| 2 | \nnetmats_parcor | \nridge | \nage | \n25 | \nNaN | \n1.0 | \n0.261131 | \n0.33966 | \n0.000999 | \n0.106893 | \n
| 3 | \nnetmats_parcor | \nridge | \nage | \n25 | \nNaN | \n1.0 | \n0.010133 | \n0.316683 | \n0.000999 | \n0.518482 | \n
| 4 | \nnetmats_parcor | \nridge | \nage | \n25 | \nNaN | \n1.0 | \n0.267432 | \n1.0 | \n0.000999 | \n0.107892 | \n
| ... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
| 11995 | \nnetmats_parcor | \nridge | \nPicSeq_AgeAdj | \n501 | \n0.169242 | \n1.0 | \n0.214501 | \n0.001998 | \n0.000999 | \n0.000999 | \n
| 11996 | \nnetmats_parcor | \nridge | \nPicSeq_AgeAdj | \n501 | \n0.236193 | \n1.0 | \n0.17445 | \n0.000999 | \n0.000999 | \n0.000999 | \n
| 11997 | \nnetmats_parcor | \nridge | \nPicSeq_AgeAdj | \n501 | \n0.201298 | \n1.0 | \n0.265569 | \n0.000999 | \n0.000999 | \n0.000999 | \n
| 11998 | \nnetmats_parcor | \nridge | \nPicSeq_AgeAdj | \n501 | \n0.261004 | \n1.0 | \n0.19719 | \n0.000999 | \n0.000999 | \n0.000999 | \n
| 11999 | \nnetmats_parcor | \nridge | \nPicSeq_AgeAdj | \n501 | \n0.108183 | \n1.0 | \n0.203605 | \n0.011988 | \n0.000999 | \n0.000999 | \n
12000 rows × 10 columns
\n" }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor,\n", "}\n", "\n", "models = {\n", " 'ridge': Ridge()\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:\n", " for sample_size in [25, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375, 400, 425, 450, 475, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set])\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 'p_discovery_cv': p_discovery_cv,\n", " 'p_discovery_overfit': 
p_discovery_overfit,\n", " 'p_replication': p_replication\n", " })\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']| \n | connectivity | \nmodel | \ntarget | \nn | \nr_discovery_cv | \nr_discovery_overfit | \nr_replication | \np_discovery_cv | \np_discovery_overfit | \np_replication | \n
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \nnetmats_pearson | \nPCA_SVR | \nage | \n25 | \nNaN | \n0.640453 | \n0.249772 | \n0.904096 | \n0.000999 | \n0.110889 | \n
| 1 | \nnetmats_pearson | \nPCA_SVR | \nage | \n25 | \nNaN | \n0.674393 | \n-0.05573 | \n0.959041 | \n0.000999 | \n0.606394 | \n
| 2 | \nnetmats_pearson | \nPCA_SVR | \nage | \n25 | \nNaN | \n0.656076 | \n-0.149824 | \n0.908092 | \n0.000999 | \n0.724276 | \n
| 3 | \nnetmats_pearson | \nPCA_SVR | \nage | \n25 | \nNaN | \n0.691142 | \n0.022858 | \n0.40959 | \n0.000999 | \n0.458541 | \n
| 4 | \nnetmats_pearson | \nPCA_SVR | \nage | \n25 | \nNaN | \n0.723121 | \n-0.185205 | \n1.0 | \n0.000999 | \n0.833167 | \n
| ... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
| 11995 | \nnetmats_pearson | \nPCA_SVR | \nPicSeq_AgeAdj | \n501 | \n0.017149 | \n0.334397 | \n-0.004167 | \n0.796204 | \n0.000999 | \n0.542458 | \n
| 11996 | \nnetmats_pearson | \nPCA_SVR | \nPicSeq_AgeAdj | \n501 | \n-0.090964 | \n0.325431 | \n-0.060089 | \n0.99001 | \n0.000999 | \n0.899101 | \n
| 11997 | \nnetmats_pearson | \nPCA_SVR | \nPicSeq_AgeAdj | \n501 | \n0.073699 | \n0.307215 | \n-0.083865 | \n0.23976 | \n0.000999 | \n0.973027 | \n
| 11998 | \nnetmats_pearson | \nPCA_SVR | \nPicSeq_AgeAdj | \n501 | \n0.039265 | \n0.311807 | \n-0.081005 | \n0.305694 | \n0.000999 | \n0.962038 | \n
| 11999 | \nnetmats_pearson | \nPCA_SVR | \nPicSeq_AgeAdj | \n501 | \n-0.068938 | \n0.278715 | \n0.065742 | \n0.987013 | \n0.000999 | \n0.08991 | \n
12000 rows × 10 columns
\n| \n | connectivity | \nmodel | \ntarget | \nn | \nr_discovery_cv | \nr_discovery_overfit | \nr_replication | \np_discovery_cv | \np_discovery_overfit | \np_replication | \n
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \nnetmats_parcor | \nRidge | \nage | \n25 | \nNaN | \n1.0 | \n-0.107322 | \n0.966034 | \n0.000999 | \n0.716284 | \n
| 1 | \nnetmats_parcor | \nRidge | \nage | \n25 | \nNaN | \n1.0 | \n0.218641 | \n0.997003 | \n0.000999 | \n0.144855 | \n
| 2 | \nnetmats_parcor | \nRidge | \nage | \n25 | \nNaN | \n1.0 | \n0.135509 | \n0.505495 | \n0.000999 | \n0.273726 | \n
| 3 | \nnetmats_parcor | \nRidge | \nage | \n25 | \nNaN | \n1.0 | \n-0.186092 | \n0.988012 | \n0.000999 | \n0.812188 | \n
| 4 | \nnetmats_parcor | \nRidge | \nage | \n25 | \nNaN | \n1.0 | \n0.241457 | \n0.866134 | \n0.000999 | \n0.130869 | \n
| ... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
| 11995 | \nnetmats_parcor | \nRidge | \nPicSeq_AgeAdj | \n501 | \n0.001456 | \n1.0 | \n-0.069881 | \n0.512488 | \n0.000999 | \n0.941059 | \n
| 11996 | \nnetmats_parcor | \nRidge | \nPicSeq_AgeAdj | \n501 | \n-0.073583 | \n1.0 | \n-0.004422 | \n0.943057 | \n0.000999 | \n0.526474 | \n
| 11997 | \nnetmats_parcor | \nRidge | \nPicSeq_AgeAdj | \n501 | \n0.04575 | \n1.0 | \n-0.040273 | \n0.087912 | \n0.000999 | \n0.824176 | \n
| 11998 | \nnetmats_parcor | \nRidge | \nPicSeq_AgeAdj | \n501 | \n0.037359 | \n1.0 | \n0.030245 | \n0.267732 | \n0.000999 | \n0.256743 | \n
| 11999 | \nnetmats_parcor | \nRidge | \nPicSeq_AgeAdj | \n501 | \n0.180913 | \n1.0 | \n-0.007398 | \n0.000999 | \n0.000999 | \n0.551449 | \n
12000 rows × 10 columns
\n" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "random_state = 42\n", "n_bootstrap = 100\n", "\n", "features = {\n", " 'netmats_parcor': netmats_parcor\n", "}\n", "\n", "models = {\n", " 'Ridge': Ridge()\n", "\n", "}\n", "\n", "# We aggregate all results here:\n", "df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])\n", "\n", "for feature_set in features:\n", " for model in models:\n", " for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:\n", " for sample_size in [25, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375, 400, 425, 450, 475, 'max']:\n", "\n", " print('*****************************************************************')\n", " print(feature_set, model, target_var, sample_size)\n", "\n", " X, y = create_data(target=target_var, feature_data=features[feature_set]) # gives a random y when target is None\n", "\n", " if sample_size=='max':\n", " sample_size = int(len(y)/2)\n", "\n", " # create random seeds for each bootstrap iteration for reproducibility\n", " rng = np.random.default_rng(random_state)\n", " random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)\n", "\n", " # run bootstrap iterations in parallel, with shuffle_y=True\n", " r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(\n", " *Parallel(n_jobs=-1)(\n", " delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed, shuffle_y=True) for seed in random_sates))\n", "\n", " tmp_data_frame = pd.DataFrame({\n", " 'connectivity' : feature_set,\n", " 'model' : model,\n", " 'target' : target_var,\n", " 'n' : sample_size,\n", " 'r_discovery_cv': r_discovery_cv,\n", " 'r_discovery_overfit': r_discovery_overfit,\n", " 'r_replication': r_replication,\n", " 
'p_discovery_cv': p_discovery_cv,\n", " 'p_discovery_overfit': p_discovery_overfit,\n", " 'p_replication': p_replication\n", " })\n", "\n", " #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)\n", " #plt.ylabel('in-sample (r)')\n", " #plt.xlabel('out-of-sample (r_pred)')\n", " #plt.show()\n", " print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())\n", "\n", " for alpha in [0.05, 0.01, 0.005, 0.001]:\n", " print('Replicability at alpha =', alpha, ':',\n", " (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']