{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import operator\n",
    "import numpy as np\n",
    "import sklearn.preprocessing\n",
    "import sklearn.utils\n",
    "from sklearn.decomposition import PCA \n",
    "from sklearn.kernel_ridge import KernelRidge\n",
    "from sklearn.metrics import mean_absolute_error, accuracy_score\n",
    "import sklearn.metrics as sklm\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn import linear_model\n",
    "from sklearn.model_selection import train_test_split\n",
    "from functools import partial\n",
    "from scipy.stats import pearsonr\n",
    "import matplotlib.pyplot as plt\n",
    "import pickle\n",
    "import os\n",
    "from matplotlib import rc\n",
    "import matplotlib\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fs = 10 # font size\n",
    "fs_label = 10 # tick label size\n",
    "fs_lgd = 10 # legend font size\n",
    "ss = 20 # symbol size\n",
    "ts = 3 # tick size\n",
    "slw = 1 # symbol line width\n",
    "framelw = 1 # line width of frame\n",
    "lw = 2 # line width of the bar box\n",
    "rc('axes', linewidth=framelw)\n",
    "plt.rcParams.update({\n",
    "    \"text.usetex\": False,\n",
    "    \"font.weight\":\"bold\",\n",
    "    \"axes.labelweight\":\"bold\",\n",
    "    \"font.size\":fs,\n",
    "    'pdf.fonttype':'truetype'\n",
    "})\n",
    "plt.rcParams['mathtext.fontset']='stix'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download Data and load data into Pandas\n",
    "\n",
    "We will use the Supporting Information Data of Download the data from the following reference, \n",
    "\n",
    "* Fang Liu, Chenru Duan, and Heather J. Kulik\n",
    "The Journal of Physical Chemistry Letters 2020 11 (19), 8067-8076\n",
    "DOI: 10.1021/acs.jpclett.0c02288\n",
    "\n",
    "The Supporting Information data is free of charge and available for research use. Execute the next cell to download and unzip the dataset. Alternatively you can manually download and extract the dataset from following URL https://pubs.acs.org/doi/suppl/10.1021/acs.jpclett.0c02288/suppl_file/jz0c02288_si_002.zip and then upload to mybinder notebook by following these steps: \n",
    "1. click on Jupyter logo in the top left corner \n",
    "2. navigate to JupyterNotebook/models/ \n",
    "3. upload the locally downloaded and extracted dataset by click on the up arrow below the Run menu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "!rm -rf models\n",
    "!mkdir models\n",
    "!wget --keep-session-cookies --save-cookies=cookie.txt \"https://pubs.acs.org/doi/10.1021/acs.jpclett.0c02288\" \n",
    "!wget --referer=\"https://pubs.acs.org/doi/10.1021/acs.jpclett.0c02288\" --cookies=on --keep-session-cookies --load-cookies=cookie.txt https://pubs.acs.org/doi/suppl/10.1021/acs.jpclett.0c02288/suppl_file/jz0c02288_si_002.zip -P models\n",
    "!unzip models/jz0c02288_si_002.zip -d models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "datapath = '' # path to your data folder\n",
    "modelpath = os.path.join(datapath,'models')\n",
    "filein=modelpath+'/Data/refined_datasets/refined_features.csv' # read in the CSV file containing the features. This file is just for example\n",
    "df_features = pd.read_csv(filein)\n",
    "\n",
    "filein2=modelpath+'/Data/refined_datasets/refined_properties.csv' # read in the CSV file containing the properties. This file is just for example\n",
    "df_props = pd.read_csv(filein2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Take a look at the features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_features.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_features.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Take a look at the properties"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_props.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_props.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Take a look at all the $r_{ND}$ diagnostic values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_props['rND'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df_props['rND'].values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Take a look at the HOMO-LUMO gap values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_props['energeticGap (eV)'].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plot $r_{ND}$ vs HOMO-LUMO gap. Is there any correlation between them?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(3.3,3.3))\n",
    "gap = df_props['energeticGap (eV)'].values\n",
    "rnd = df_props['rND'].values\n",
    "plt.scatter(gap,rnd,edgecolors=None,c='b',alpha=0.2)\n",
    "plt.ylabel('$r_{ND}$ (unitless)')\n",
    "plt.xlabel('gap (eV)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Next we are going to try different ways to predict rND"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict $r_{ND}$ from HOMO-LUMO gap with linear regression"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "extract $r_{ND}$ and gap (eV) from database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df_props['energeticGap (eV)'].values.reshape(-1, 1)\n",
    "y = df_props['rND'].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "split into train and test dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "train linear regression model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg=linear_model.LinearRegression()\n",
    "reg=reg.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "use linear regression to predict test data points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_predict = reg.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Mean square error for linear regression with one feature:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(np.square(y_test-y_predict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "fig = plt.figure(figsize=(3.3,3.3))\n",
    "plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)\n",
    "xmin=0.1; xmax=0.6\n",
    "diag=np.linspace(xmin, xmax,100)\n",
    "plt.plot(diag,diag, color='gray')\n",
    "plt.xlim((xmin,xmax))\n",
    "plt.ylim((xmin,xmax))\n",
    "plt.ylabel('$r_{ND}$ (predicted)')\n",
    "plt.xlabel('$r_{ND}$ (real)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict $r_{ND}$ from all molecular features (157) from the data set with linear regression"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We expect more features will improve the result over just one feature above."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Figure out which features are used in original database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def return_train_columns(df, cols_selected=False):\n",
    "    keys = ['RACs']\n",
    "    removed_columns = []\n",
    "    if not cols_selected:\n",
    "        return_columns = ['ligcharge', 'ox', 'spin']\n",
    "        for col in df.columns:\n",
    "            for key in keys:\n",
    "                if key in col and 'init' not in col and 'misc' not in col:\n",
    "                    if 'Zeff' not in col and '-O-' not in col:\n",
    "                        return_columns.append(col)\n",
    "    else:\n",
    "        print(\"Using input columns.\")\n",
    "        return_columns = cols_selected\n",
    "    print(\"inital: \", len(return_columns))\n",
    "    df = df.dropna(subset=return_columns)\n",
    "    thre = 1e-4\n",
    "    final_cols = []\n",
    "    for col in return_columns:\n",
    "        std = np.std(df[col].values)\n",
    "        if std < thre:\n",
    "            removed_columns.append(col)\n",
    "        else:\n",
    "            final_cols.append(col)\n",
    "    print(\"removed: \", removed_columns, len(removed_columns))\n",
    "    print(\"feature_used:\", final_cols, len(final_cols))\n",
    "    return final_cols, df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "extract all features from database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols_use, df = return_train_columns(df_features, cols_selected=False)\n",
    "X = np.array(df_features[cols_use].values)\n",
    "y= df_props['rND'].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "train-test split and linear regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)\n",
    "reg=linear_model.LinearRegression()\n",
    "reg.fit(X_train, y_train)\n",
    "y_predict = reg.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Using all features improves the result compared to using only one feature. Mean square error for all features:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(np.square(y_test-y_predict))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The number of features with non-zero contribution is high"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.where(reg.coef_!=0)[0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(3.3,3.3))\n",
    "plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)\n",
    "xmin=0.1; xmax=0.6\n",
    "diag=np.linspace(xmin, xmax,100)\n",
    "plt.plot(diag,diag, color='gray')\n",
    "plt.xlim((xmin,xmax))\n",
    "plt.ylim((xmin,xmax))\n",
    "plt.ylabel('$r_{ND}$ (predicted)')\n",
    "plt.xlabel('$r_{ND}$ (real)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict $r_{ND}$ with Elastic Net Regularization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import ElasticNet\n",
    "alpha = 0.001\n",
    "reg = ElasticNet(alpha=alpha, l1_ratio=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg.fit(X_train, y_train)\n",
    "y_predict = reg.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After adding Elastic Net regularization the error in this case study increases, but the number of features with non-zero contribution is significantly lower."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(np.square(y_test-y_predict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.where(reg.coef_!=0)[0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(3.3,3.3))\n",
    "plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)\n",
    "xmin=0.1; xmax=0.6\n",
    "diag=np.linspace(xmin, xmax,100)\n",
    "plt.plot(diag,diag, color='gray')\n",
    "plt.xlim((xmin,xmax))\n",
    "plt.ylim((xmin,xmax))\n",
    "plt.ylabel('$r_{ND}$ (predicted)')\n",
    "plt.xlabel('$r_{ND}$ (real)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict $r_{ND}$ from selected molecule features (65 optimal features selected based on performance) with linear regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rfa_features = [\"ligcharge\",\"ox\",\"spin\",\"RACs.mc-Z-0-all\",\"RACs.mc-S-0-all\",\"RACs.f-chi-3-all\",\"RACs.D_mc-Z-1-all\",\"RACs.D_mc-S-1-all\",\"RACs.f-Z-1-all\",\"RACs.f-Z-0-all\",\"RACs.f-chi-1-all\",\"RACs.f-chi-0-all\",\"RACs.f-chi-2-all\",\"RACs.f-Z-2-all\",\"RACs.D_mc-chi-2-all\",\"RACs.mc-chi-2-all\",\"RACs.mc-chi-1-all\",\"RACs.f-S-0-all\",\"RACs.f-S-2-all\",\"RACs.D_mc-chi-1-all\",\"RACs.mc-Z-1-all\",\"RACs.f-S-1-all\",\"RACs.f-S-3-all\",\"RACs.f-Z-3-all\",\"RACs.D_mc-S-2-all\",\"RACs.D_mc-Z-2-all\",\"RACs.f-I-2-all\",\"RACs.f-I-0-all\",\"RACs.mc-Z-2-all\",\"RACs.f-chi-0-eq\",\"RACs.f-I-3-all\",\"RACs.f-T-3-all\",\"RACs.lc-S-3-eq\",\"RACs.mc-chi-3-all\",\"RACs.f-T-0-all\",\"RACs.D_mc-chi-3-all\",\"RACs.D_mc-S-3-all\",\"RACs.D_mc-Z-3-all\",\"RACs.f-I-1-all\",\"RACs.lc-chi-2-eq\",\"RACs.D_lc-S-2-ax\",\"RACs.mc-T-2-all\",\"RACs.D_lc-S-2-eq\",\"RACs.D_lc-chi-2-ax\",\"RACs.D_lc-Z-2-ax\",\"RACs.D_lc-Z-2-eq\",\"RACs.D_lc-chi-2-eq\",\"RACs.f-Z-3-eq\",\"RACs.lc-T-3-eq\",\"RACs.mc-I-3-all\",\"RACs.f-Z-3-ax\",\"RACs.D_lc-T-3-ax\",\"RACs.mc-T-3-all\",\"RACs.mc-chi-0-all\",\"RACs.mc-S-1-all\",\"RACs.f-T-2-all\",\"RACs.f-T-1-all\",\"RACs.mc-Z-3-all\",\"RACs.f-T-3-ax\",\"RACs.mc-S-2-all\",\"RACs.f-Z-1-ax\",\"RACs.mc-S-3-all\",\"RACs.D_mc-T-3-all\",\"RACs.f-Z-2-ax\",\"RACs.f-T-3-eq\"]\n",
    "\n",
    "cols_use, df = return_train_columns(df_features, cols_selected=rfa_features)\n",
    "X = np.array(df_features[cols_use].values)\n",
    "y= df_props['rND'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)\n",
    "reg=linear_model.LinearRegression()\n",
    "reg.fit(X_train, y_train)\n",
    "y_predict = reg.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Using optimal features improves the result compared to using all features. Mean square error for optimal features:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(np.square(y_test-y_predict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(3.3,3.3))\n",
    "plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)\n",
    "xmin=0.1; xmax=0.6\n",
    "diag=np.linspace(xmin, xmax,100)\n",
    "plt.plot(diag,diag, color='gray')\n",
    "plt.xlim((xmin,xmax))\n",
    "plt.ylim((xmin,xmax))\n",
    "plt.ylabel('$r_{ND}$ (predicted)')\n",
    "plt.xlabel('$r_{ND}$ (real)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Comparison with Kernel ridge regression (KRR) model to predict $r_{ND}$ from the RACS features "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_data(df, x_scaler, cols_selected=False):\n",
    "    np.random.seed(0)\n",
    "    cols_use, df = return_train_columns(df, cols_selected)\n",
    "    X = np.array(df[cols_use].values)\n",
    "    X_scaled = x_scaler.transform(X)\n",
    "    return X_scaled, cols_use\n",
    "\n",
    "def predict(model_filename, df, x_scaler_filename, y_scaler_filename,y , cols_selected=False):\n",
    "    x_scaler = pickle.load(open(x_scaler_filename,'rb'))\n",
    "    y_scaler = pickle.load(open(y_scaler_filename,'rb'))\n",
    "    krr = pickle.load(open(model_filename,'rb'))\n",
    "    X_scaled, cols_use =  prepare_data(df, x_scaler, cols_selected=cols_selected)\n",
    "    X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled,y, test_size=0.2, random_state=1)\n",
    "    y_predict_scaled = krr.predict(X_scaled_test)\n",
    "    y_predict = y_scaler.inverse_transform(y_predict_scaled)[:,0]\n",
    "    return y_predict, y_test\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "modelfile = os.path.join(modelpath,'Data/models/KRR/MD2_rND_krr_model/MD2_rND_krr.pkl')\n",
    "x_scaler_filename =  os.path.join(modelpath,'Data/models/KRR/MD2_rND_krr_model/x_scaler.pkl')\n",
    "y_scaler_filename =  os.path.join(modelpath,'Data/models/KRR/MD2_rND_krr_model/y_scaler.pkl')\n",
    "# Note we are not using all features. We are using the selected features verified to be optimal\n",
    "rfa_features = [\"ligcharge\",\"ox\",\"spin\",\"RACs.mc-Z-0-all\",\"RACs.mc-S-0-all\",\"RACs.f-chi-3-all\",\"RACs.D_mc-Z-1-all\",\"RACs.D_mc-S-1-all\",\"RACs.f-Z-1-all\",\"RACs.f-Z-0-all\",\"RACs.f-chi-1-all\",\"RACs.f-chi-0-all\",\"RACs.f-chi-2-all\",\"RACs.f-Z-2-all\",\"RACs.D_mc-chi-2-all\",\"RACs.mc-chi-2-all\",\"RACs.mc-chi-1-all\",\"RACs.f-S-0-all\",\"RACs.f-S-2-all\",\"RACs.D_mc-chi-1-all\",\"RACs.mc-Z-1-all\",\"RACs.f-S-1-all\",\"RACs.f-S-3-all\",\"RACs.f-Z-3-all\",\"RACs.D_mc-S-2-all\",\"RACs.D_mc-Z-2-all\",\"RACs.f-I-2-all\",\"RACs.f-I-0-all\",\"RACs.mc-Z-2-all\",\"RACs.f-chi-0-eq\",\"RACs.f-I-3-all\",\"RACs.f-T-3-all\",\"RACs.lc-S-3-eq\",\"RACs.mc-chi-3-all\",\"RACs.f-T-0-all\",\"RACs.D_mc-chi-3-all\",\"RACs.D_mc-S-3-all\",\"RACs.D_mc-Z-3-all\",\"RACs.f-I-1-all\",\"RACs.lc-chi-2-eq\",\"RACs.D_lc-S-2-ax\",\"RACs.mc-T-2-all\",\"RACs.D_lc-S-2-eq\",\"RACs.D_lc-chi-2-ax\",\"RACs.D_lc-Z-2-ax\",\"RACs.D_lc-Z-2-eq\",\"RACs.D_lc-chi-2-eq\",\"RACs.f-Z-3-eq\",\"RACs.lc-T-3-eq\",\"RACs.mc-I-3-all\",\"RACs.f-Z-3-ax\",\"RACs.D_lc-T-3-ax\",\"RACs.mc-T-3-all\",\"RACs.mc-chi-0-all\",\"RACs.mc-S-1-all\",\"RACs.f-T-2-all\",\"RACs.f-T-1-all\",\"RACs.mc-Z-3-all\",\"RACs.f-T-3-ax\",\"RACs.mc-S-2-all\",\"RACs.f-Z-1-ax\",\"RACs.mc-S-3-all\",\"RACs.D_mc-T-3-all\",\"RACs.f-Z-2-ax\",\"RACs.f-T-3-eq\"]\n",
    "y_predict, y_test = predict(modelfile, df_features, x_scaler_filename, y_scaler_filename,y, cols_selected=rfa_features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using Kernel ridge regression (KRR) improves the result compared to linear regression. Mean square error for KRR model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(np.square(y_test-y_predict))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Improved result is visible in the plot."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)\n",
    "xmin=0.1; xmax=0.6\n",
    "diag=np.linspace(xmin, xmax,100)\n",
    "plt.plot(diag,diag, color='gray')\n",
    "plt.xlim((xmin,xmax))\n",
    "plt.ylim((xmin,xmax))\n",
    "plt.ylabel('$r_{ND}$ (predicted)')\n",
    "plt.xlabel('$r_{ND}$ (real)')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}