{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from functools import reduce\n", "\n", "def _get_model_name(model):\n", " \"\"\"\n", " Returns a string with the name of a sklearn model\n", " model: Sklearn stimator class\n", " \"\"\"\n", " if isinstance(model, Pipeline):\n", " estimator = model.steps[-1][1]\n", " name = \"Pipeline_\" + str(estimator)[:str(estimator).find(\"(\")]\n", " else: \n", " name = str(model)[:str(model).find(\"(\")]\n", " return name\n", " \n", " \n", "def plot_cv_score(X, y, models_list, cv = 5, scoring_list = None, refit = True, return_scores = True):\n", " \"\"\" \n", " X: numpy_array/pandas dataframe n_rows, m_features\n", " y: numpy_array/pandas dataframe n_rows\n", " Plots min, max and avg kfold crosval_score for a list of models\n", " \n", " \"\"\"\n", " \n", " \n", " \n", " names, mean_score = list(), list()\n", " ldf = list()\n", " mnames = list()\n", " \n", " for i, model in enumerate(models_list):\n", " name = _get_model_name(model)\n", " \n", " if refit:\n", " model.fit(X, y)\n", " \n", " for metric in score_list:\n", " \n", " score = cross_val_score(model, X, y, cv = cv, scoring = metric, n_jobs= -1)\n", " mean_score.append(np.mean(score))\n", " \n", " \n", " tmp = pd.DataFrame({name: mean_score}, index = score_list)\n", " \n", " \n", " \n", " ldf.append(tmp)\n", " \n", " \n", " mean_score = list()\n", " \n", " frame_scores = reduce(lambda x,y: pd.merge(x,y, left_index = True, right_index = True), ldf).T\n", " print(frame_scores)\n", " \n", " \n", " fig, ax = plt.subplots(1,1, figsize = (10,5))\n", "\n", " frame_scores.plot.bar(ax = ax, cmap = 'RdYlBu', edgecolor = \"black\")\n", " ax.legend(loc = 'best')\n", " ax.set_ylabel(\"Score\")\n", " ax.set_title(\"Cross validation model benchmark\")\n", "\n", " if return_scores: \n", " return frame_scores" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score\n", "from sklearn.datasets import load_breast_cancer\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.svm import SVC\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.pipeline import Pipeline\n", "\n", "X, y = load_breast_cancer(return_X_y= True)\n", "\n", "models_list =[LogisticRegression(random_state= 42),\n", " # SVC(probability= True),\n", " RandomForestClassifier(random_state = 42),\n", " GaussianNB()]\n", "\n", "score_list = [\"roc_auc\", \"accuracy\", \"f1\", \"precision\", \"recall\"]\n", "\n", "t = plot_cv_score(X = X, y = y, models_list = models_list, cv = 5, scoring_list = score_list, refit = True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# TEST\n", "import pandas as pd \n", " \n", "# assign data of lists. \n", "data = {'Accuracy': [0.9764, 0.9818, 0.9991, 0.9985, 0.9884], \n", " 'Precision':[0.9761, 0.9816, 0.9991, 0.9985, 0.9882],\n", " 'Recall':[0.9767, 0.9819, 0.9992, 0.9985, 0.9885],\n", " 'F1-Score':[0.9763, 0.9817, 0.9991, 0.9985, 0.9883],} \n", " \n", "# Creates pandas DataFrame. \n", "df = pd.DataFrame(data, index =['GradientBoostingClassifier', 'GradientBoostingRegressor', 'HistGradientBoostingClassifier', 'HistGradientBoostingRegressor', 'AdaBoostClassifier']) \n", " \n", "# print the data \n", "print(df) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots(1,1, figsize = (12,5))\n", "\n", "plt.rcParams[\"font.family\"] = \"Times New Roman\"\n", "df.plot.bar(ax = ax, cmap = 'GnBu', edgecolor = \"black\")\n", "plt.ylim(0.95,1)\n", "ax.legend(loc = 'best')\n", "ax.set_ylabel(\"Score\", fontproperties = \"Times New Roman\", fontweight = 'bold', fontsize = 14)\n", "ax.set_xlabel(\"Model\", fontproperties = \"Times New Roman\", fontweight = 'bold', fontsize = 14)\n", "plt.yticks(fontproperties = \"Times New Roman\", size = 12, weight = \"bold\")\n", "plt.xticks(fontproperties = \"Times New Roman\", size = 12, weight = \"bold\", rotation = 355)\n", "ax.set_title(\"Evaluation metrics among different models\", fontproperties = \"Times New Roman\", fontweight = 'bold', fontsize = 15)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.7.6 64-bit", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.7.6" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "3499bcc59dd07de3752bcaf4b431b7cc0c8d7df018f3c7c8f72730d6f0400322" } } }, "nbformat": 4, "nbformat_minor": 2 }