{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Analysis of the ABIDE dataset\n", "\n", "### Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2021-08-03T20:38:07.505483Z", "start_time": "2021-08-03T20:38:07.492504Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/tspisak/src/mlconfound/venv/lib/python3.8/site-packages/nilearn/datasets/__init__.py:86: FutureWarning: Fetchers from the nilearn.datasets module will be updated in version 0.9 to return python strings instead of bytes and Pandas dataframes instead of Numpy arrays.\n", " warn(\"Fetchers from the nilearn.datasets module will be \"\n" ] } ], "source": [ "import warnings\n", "import os\n", "from os.path import join\n", "import numpy as np\n", "import pandas as pd\n", "from scipy.stats import kurtosis, skew\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "sns.set_style(\"whitegrid\")\n", "\n", "from sklearn.model_selection import StratifiedKFold, GridSearchCV\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.feature_selection import VarianceThreshold\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.preprocessing import quantile_transform\n", "from sklearn.metrics import roc_curve, RocCurveDisplay\n", "from nilearn.datasets.utils import _uncompress_file, _fetch_file\n", "from nilearn.connectome import ConnectivityMeasure\n", "\n", "from neurocombat_sklearn import CombatModel\n", "\n", "import statsmodels.api as sm\n", "from statsmodels.regression.linear_model import OLS\n", "from statsmodels.formula.api import ols as ols_f\n", "\n", "from mlconfound.stats import full_confound_test, partial_confound_test\n", "from mlconfound.plot import plot_graph\n", "from mlconfound.stats import _r2_cat_cont, _r2_cont_cont, _r2_cat_cat" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2021-07-31T18:11:08.469487Z", "start_time": "2021-07-31T18:11:08.452669Z" } }, "source": [ "## Load data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2021-08-01T19:07:13.775609Z", "start_time": "2021-08-01T19:07:13.772837Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading data from https://osf.io/hc4md/download ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Downloaded 1803198464 of 1811491701 bytes (99.5%, 0.7s remaining) ...done. (162 seconds, 2 min)\n", "Extracting data from ../data_in/ABIDE/download..... done.\n" ] } ], "source": [ "data_dir = '../data_in/ABIDE'\n", "\n", "url = 'https://osf.io/hc4md/download'\n", "\n", "# Download the zip file, first\n", "dl_file = _fetch_file(url, data_dir=data_dir)\n", "\n", "# Second, uncompress the downloaded zip file\n", "_uncompress_file(dl_file, verbose=2)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2021-08-01T19:07:13.788158Z", "start_time": "2021-08-01T19:07:13.779135Z" } }, "outputs": [], "source": [ "def _get_paths(phenotypic, atlas, timeseries_dir):\n", " \"\"\"\n", " \"\"\"\n", " timeseries = []\n", " IDs_subject = []\n", " diagnosis = []\n", " subject_ids = phenotypic['SUB_ID']\n", " mean_fd = []\n", " num_fd = []\n", " perc_fd = []\n", " site = []\n", " for index, subject_id in enumerate(subject_ids):\n", " this_pheno = phenotypic[phenotypic['SUB_ID'] == subject_id]\n", " this_timeseries = join(timeseries_dir, atlas,\n", " str(subject_id) + '_timeseries.txt')\n", " if os.path.exists(this_timeseries):\n", " timeseries.append(np.loadtxt(this_timeseries))\n", " IDs_subject.append(subject_id)\n", " diagnosis.append(this_pheno['DX_GROUP'].values[0])\n", " mean_fd.append(this_pheno['func_mean_fd'].values[0])\n", " num_fd.append(this_pheno['func_num_fd'].values[0])\n", " perc_fd.append(this_pheno['func_perc_fd'].values[0])\n", " site.append(this_pheno['SITE_ID'].values[0])\n", " return timeseries, diagnosis, IDs_subject, mean_fd, num_fd, perc_fd, site" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2021-08-01T19:07:43.346661Z", "start_time": "2021-08-01T19:07:13.792338Z" }, "pycharm": { "name": "#%% md\n" } }, "source": [ "Download the phenotypic summary information file form the preprocessed connectomes project.\n", "- First read:\n", " http://preprocessed-connectomes-project.org/abide/download.html\n", "- Then download:\n", " https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Phenotypic_V1_0b_preprocessed1.csv\n", "- Copy the csv file into the data_in/ABIDE directory" ] }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "data": { "text/plain": " SUB_ID X subject SITE_ID FILE_ID DX_GROUP DSM_IV_TR \\\n0 50002 1 50002 PITT no_filename 1 1 \n1 50003 2 50003 PITT Pitt_0050003 1 1 \n2 50004 3 50004 PITT Pitt_0050004 1 1 \n3 50005 4 50005 PITT Pitt_0050005 1 1 \n4 50006 5 50006 PITT Pitt_0050006 1 1 \n... ... ... ... ... ... ... ... \n1107 51583 1108 51583 SBL SBL_0051583 1 2 \n1108 51584 1109 51584 SBL SBL_0051584 1 2 \n1109 51585 1110 51585 SBL SBL_0051585 1 1 \n1110 51606 1111 51606 MAX_MUN MaxMun_a_0051606 1 2 \n1111 51607 1112 51607 MAX_MUN MaxMun_a_0051607 1 2 \n\n AGE_AT_SCAN SEX HANDEDNESS_CATEGORY ... qc_notes_rater_1 \\\n0 16.77 1 Ambi ... NaN \n1 24.45 1 R ... NaN \n2 19.09 1 R ... NaN \n3 13.73 2 R ... NaN \n4 13.37 1 L ... NaN \n... ... ... ... ... ... \n1107 35.00 1 NaN ... NaN \n1108 49.00 1 NaN ... NaN \n1109 27.00 1 NaN ... NaN \n1110 29.00 2 R ... NaN \n1111 26.00 1 R ... NaN \n\n qc_anat_rater_2 qc_anat_notes_rater_2 qc_func_rater_2 \\\n0 OK NaN fail \n1 OK NaN OK \n2 OK NaN OK \n3 OK NaN maybe \n4 OK NaN maybe \n... ... ... ... \n1107 OK NaN OK \n1108 OK NaN maybe \n1109 OK NaN maybe \n1110 OK NaN maybe \n1111 OK NaN maybe \n\n qc_func_notes_rater_2 qc_anat_rater_3 qc_anat_notes_rater_3 \\\n0 ic-parietal-cerebellum OK NaN \n1 NaN OK NaN \n2 NaN OK NaN \n3 ic-parietal-cerebellum OK NaN \n4 ic-parietal slight OK NaN \n... ... ... ... \n1107 ic-cerebellum-temporal_lobe OK NaN \n1108 vmpfc dropout OK NaN \n1109 ic-cerebellum-temporal_lobe OK NaN \n1110 ic-cerebellum OK NaN \n1111 ic-cerebellum OK NaN \n\n qc_func_rater_3 qc_func_notes_rater_3 SUB_IN_SMP \n0 fail ERROR #24 1 \n1 OK NaN 1 \n2 OK NaN 1 \n3 OK NaN 0 \n4 OK NaN 1 \n... ... ... ... \n1107 OK NaN 0 \n1108 OK NaN 0 \n1109 OK NaN 0 \n1110 OK NaN 0 \n1111 OK NaN 1 \n\n[1112 rows x 104 columns]", "text/html": "
| \n | SUB_ID | \nX | \nsubject | \nSITE_ID | \nFILE_ID | \nDX_GROUP | \nDSM_IV_TR | \nAGE_AT_SCAN | \nSEX | \nHANDEDNESS_CATEGORY | \n... | \nqc_notes_rater_1 | \nqc_anat_rater_2 | \nqc_anat_notes_rater_2 | \nqc_func_rater_2 | \nqc_func_notes_rater_2 | \nqc_anat_rater_3 | \nqc_anat_notes_rater_3 | \nqc_func_rater_3 | \nqc_func_notes_rater_3 | \nSUB_IN_SMP | \n
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n50002 | \n1 | \n50002 | \nPITT | \nno_filename | \n1 | \n1 | \n16.77 | \n1 | \nAmbi | \n... | \nNaN | \nOK | \nNaN | \nfail | \nic-parietal-cerebellum | \nOK | \nNaN | \nfail | \nERROR #24 | \n1 | \n
| 1 | \n50003 | \n2 | \n50003 | \nPITT | \nPitt_0050003 | \n1 | \n1 | \n24.45 | \n1 | \nR | \n... | \nNaN | \nOK | \nNaN | \nOK | \nNaN | \nOK | \nNaN | \nOK | \nNaN | \n1 | \n
| 2 | \n50004 | \n3 | \n50004 | \nPITT | \nPitt_0050004 | \n1 | \n1 | \n19.09 | \n1 | \nR | \n... | \nNaN | \nOK | \nNaN | \nOK | \nNaN | \nOK | \nNaN | \nOK | \nNaN | \n1 | \n
| 3 | \n50005 | \n4 | \n50005 | \nPITT | \nPitt_0050005 | \n1 | \n1 | \n13.73 | \n2 | \nR | \n... | \nNaN | \nOK | \nNaN | \nmaybe | \nic-parietal-cerebellum | \nOK | \nNaN | \nOK | \nNaN | \n0 | \n
| 4 | \n50006 | \n5 | \n50006 | \nPITT | \nPitt_0050006 | \n1 | \n1 | \n13.37 | \n1 | \nL | \n... | \nNaN | \nOK | \nNaN | \nmaybe | \nic-parietal slight | \nOK | \nNaN | \nOK | \nNaN | \n1 | \n
| ... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
| 1107 | \n51583 | \n1108 | \n51583 | \nSBL | \nSBL_0051583 | \n1 | \n2 | \n35.00 | \n1 | \nNaN | \n... | \nNaN | \nOK | \nNaN | \nOK | \nic-cerebellum-temporal_lobe | \nOK | \nNaN | \nOK | \nNaN | \n0 | \n
| 1108 | \n51584 | \n1109 | \n51584 | \nSBL | \nSBL_0051584 | \n1 | \n2 | \n49.00 | \n1 | \nNaN | \n... | \nNaN | \nOK | \nNaN | \nmaybe | \nvmpfc dropout | \nOK | \nNaN | \nOK | \nNaN | \n0 | \n
| 1109 | \n51585 | \n1110 | \n51585 | \nSBL | \nSBL_0051585 | \n1 | \n1 | \n27.00 | \n1 | \nNaN | \n... | \nNaN | \nOK | \nNaN | \nmaybe | \nic-cerebellum-temporal_lobe | \nOK | \nNaN | \nOK | \nNaN | \n0 | \n
| 1110 | \n51606 | \n1111 | \n51606 | \nMAX_MUN | \nMaxMun_a_0051606 | \n1 | \n2 | \n29.00 | \n2 | \nR | \n... | \nNaN | \nOK | \nNaN | \nmaybe | \nic-cerebellum | \nOK | \nNaN | \nOK | \nNaN | \n0 | \n
| 1111 | \n51607 | \n1112 | \n51607 | \nMAX_MUN | \nMaxMun_a_0051607 | \n1 | \n2 | \n26.00 | \n1 | \nR | \n... | \nNaN | \nOK | \nNaN | \nmaybe | \nic-cerebellum | \nOK | \nNaN | \nOK | \nNaN | \n1 | \n
1112 rows × 104 columns
\n