{ "cells": [ { "cell_type": "markdown", "id": "4615f208", "metadata": {}, "source": [ "### Model Training\n", "1.1 Import Data and Required Packages" ] }, { "cell_type": "code", "execution_count": 1, "id": "bbb5fb5c", "metadata": {}, "outputs": [], "source": [ "# Basic Import\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt \n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "id": "066470c9", "metadata": {}, "outputs": [], "source": [ "# Modelling\n", "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n", "from sklearn.neighbors import KNeighborsRegressor\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor\n", "from sklearn.svm import SVR\n", "from sklearn.linear_model import LinearRegression, Ridge,Lasso\n", "from sklearn.model_selection import RandomizedSearchCV\n", "from catboost import CatBoostRegressor\n", "from xgboost import XGBRegressor\n", "import warnings" ] }, { "cell_type": "code", "execution_count": 3, "id": "41e4a32e", "metadata": {}, "outputs": [], "source": [ "# Read the student performance dataset; the relative path is resolved against\n", "# the current working directory\n", "DATA_PATH = 'studentsperformance.csv'\n", "df = pd.read_csv(DATA_PATH)" ] }, { "cell_type": "code", "execution_count": 4, "id": "19a8f0b4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderrace/ethnicityparental level of educationlunchtest preparation coursemath scorereading scorewriting score
0femalegroup Bbachelor's degreestandardnone727274
\n", "
" ], "text/plain": [ " gender race/ethnicity parental level of education lunch \\\n", "0 female group B bachelor's degree standard \n", "\n", " test preparation course math score reading score writing score \n", "0 none 72 72 74 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(1)" ] }, { "cell_type": "code", "execution_count": 5, "id": "e9cac137", "metadata": {}, "outputs": [], "source": [ "# Separate the regression target from the predictor columns\n", "TARGET = 'math score'\n", "y = df[TARGET]\n", "X = df.drop(columns=[TARGET])" ] }, { "cell_type": "code", "execution_count": 6, "id": "540fab31", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderrace/ethnicityparental level of educationlunchtest preparation coursereading scorewriting score
0femalegroup Bbachelor's degreestandardnone7274
\n", "
" ], "text/plain": [ " gender race/ethnicity parental level of education lunch \\\n", "0 female group B bachelor's degree standard \n", "\n", " test preparation course reading score writing score \n", "0 none 72 74 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.head(1)" ] }, { "cell_type": "code", "execution_count": 7, "id": "51144d44", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 72\n", "Name: math score, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y.head(1)" ] }, { "cell_type": "markdown", "id": "01ee80ae", "metadata": {}, "source": [ "### Pipeline" ] }, { "cell_type": "code", "execution_count": 8, "id": "11627557", "metadata": {}, "outputs": [], "source": [ "# Build a ColumnTransformer with two transformers: one-hot encoding for the\n", "# categorical columns and standard scaling for the numeric columns.\n", "# X is left untouched here, so this cell can be re-run safely.\n", "num_features = X.select_dtypes(exclude=\"object\").columns\n", "cat_features = X.select_dtypes(include=\"object\").columns\n", "\n", "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", "from sklearn.compose import ColumnTransformer\n", "\n", "numeric_transformer = StandardScaler()\n", "# handle_unknown=\"ignore\": the encoder is fitted on the training rows only, so a\n", "# category that appears only in the test rows must encode to all zeros instead\n", "# of raising an error\n", "oh_transformer = OneHotEncoder(handle_unknown=\"ignore\")\n", "\n", "preprocessor = ColumnTransformer(\n", "    [\n", "        (\"OneHotEncoder\", oh_transformer, cat_features),\n", "        (\"StandardScaler\", numeric_transformer, num_features),\n", "    ]\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "id": "e5d3581d", "metadata": {}, "outputs": [], "source": [ "# separate dataset into train and test BEFORE fitting the preprocessor, so that\n", "# no statistics of the held-out rows leak into the encoder or the scaler\n", "from sklearn.model_selection import train_test_split\n", "X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 10, "id": "ba9529a8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((800, 19), (200, 19))" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit the preprocessor on the training rows only, then apply the fitted\n", "# transformation to both splits\n", "X_train = preprocessor.fit_transform(X_train_raw)\n", "X_test = preprocessor.transform(X_test_raw)\n", "X_train.shape, 
X_test.shape" ] }, { "cell_type": "code", "execution_count": 11, "id": "8d3fbfa9", "metadata": {}, "outputs": [], "source": [ "# Create an Evaluate Function to give all metrics after model Training\n", "def evaluate_model(true, predicted):\n", "    \"\"\"Score a set of predictions against the true target values.\n", "\n", "    Parameters\n", "    ----------\n", "    true : array-like\n", "        Ground-truth target values.\n", "    predicted : array-like\n", "        Model predictions, in the same order as ``true``.\n", "\n", "    Returns\n", "    -------\n", "    tuple\n", "        ``(mae, rmse, r2)``: mean absolute error, root mean squared error\n", "        and R2 score, in that order.\n", "    \"\"\"\n", "    squared_error = mean_squared_error(true, predicted)\n", "    return (\n", "        mean_absolute_error(true, predicted),\n", "        np.sqrt(squared_error),  # RMSE is the square root of the MSE\n", "        r2_score(true, predicted),\n", "    )" ] }, { "cell_type": "code", "execution_count": 12, "id": "18928356", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Linear Regression\n", "Model performance for Training set\n", "- Root Mean Squared Error: 5.3243\n", "- Mean Absolute Error: 4.2671\n", "- R2 Score: 0.8743\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 5.3960\n", "- Mean Absolute Error: 4.2158\n", "- R2 Score: 0.8803\n", "===================================\n", "\n", "\n", "Lasso\n", "Model performance for Training set\n", "- Root Mean Squared Error: 6.5938\n", "- Mean Absolute Error: 5.2063\n", "- R2 Score: 0.8071\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 6.5197\n", "- Mean Absolute Error: 5.1579\n", "- R2 Score: 0.8253\n", "===================================\n", "\n", "\n", "Ridge\n", "Model performance for Training set\n", "- Root Mean Squared Error: 5.3233\n", "- Mean Absolute Error: 4.2650\n", "- R2 Score: 0.8743\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 5.3904\n", "- Mean Absolute Error: 4.2111\n", "- R2 Score: 0.8806\n", "===================================\n", "\n", "\n", "K-Neighbors Regressor\n", "Model performance for Training set\n", "- Root Mean Squared Error: 5.7077\n", "- Mean Absolute Error: 4.5167\n", "- R2 Score: 0.8555\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared 
Error: 7.2530\n", "- Mean Absolute Error: 5.6210\n", "- R2 Score: 0.7838\n", "===================================\n", "\n", "\n", "Decision Tree\n", "Model performance for Training set\n", "- Root Mean Squared Error: 0.2795\n", "- Mean Absolute Error: 0.0187\n", "- R2 Score: 0.9997\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 7.9066\n", "- Mean Absolute Error: 6.3550\n", "- R2 Score: 0.7431\n", "===================================\n", "\n", "\n", "Random Forest Regressor\n", "Model performance for Training set\n", "- Root Mean Squared Error: 2.2808\n", "- Mean Absolute Error: 1.8341\n", "- R2 Score: 0.9769\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 5.9168\n", "- Mean Absolute Error: 4.5796\n", "- R2 Score: 0.8561\n", "===================================\n", "\n", "\n", "XGBRegressor\n", "Model performance for Training set\n", "- Root Mean Squared Error: 0.9087\n", "- Mean Absolute Error: 0.6148\n", "- R2 Score: 0.9963\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 6.5889\n", "- Mean Absolute Error: 5.0844\n", "- R2 Score: 0.8216\n", "===================================\n", "\n", "\n", "CatBoosting Regressor\n", "Model performance for Training set\n", "- Root Mean Squared Error: 3.0427\n", "- Mean Absolute Error: 2.4054\n", "- R2 Score: 0.9589\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 6.0086\n", "- Mean Absolute Error: 4.6125\n", "- R2 Score: 0.8516\n", "===================================\n", "\n", "\n", "AdaBoost Regressor\n", "Model performance for Training set\n", "- Root Mean Squared Error: 5.7813\n", "- Mean Absolute Error: 4.7602\n", "- R2 Score: 0.8517\n", "----------------------------------\n", "Model performance for Test set\n", "- Root Mean Squared Error: 6.0781\n", "- Mean Absolute Error: 4.7711\n", "- 
R2 Score: 0.8482\n", "===================================\n", "\n", "\n" ] } ], "source": [ "# Fixed seed: scikit-learn's tree and ensemble regressors are randomised, so\n", "# without random_state the scores below change on every run\n", "SEED = 42\n", "\n", "models = {\n", "    \"Linear Regression\": LinearRegression(),\n", "    \"Lasso\": Lasso(),\n", "    \"Ridge\": Ridge(),\n", "    \"K-Neighbors Regressor\": KNeighborsRegressor(),\n", "    \"Decision Tree\": DecisionTreeRegressor(random_state=SEED),\n", "    \"Random Forest Regressor\": RandomForestRegressor(random_state=SEED),\n", "    \"XGBRegressor\": XGBRegressor(),\n", "    \"CatBoosting Regressor\": CatBoostRegressor(verbose=False),\n", "    \"AdaBoost Regressor\": AdaBoostRegressor(random_state=SEED)\n", "}\n", "model_list = []  # model names, in training order\n", "r2_list = []     # test-set R2 of each model, aligned with model_list\n", "\n", "for name, model in models.items():\n", "    model.fit(X_train, y_train)  # Train model\n", "\n", "    # Make predictions\n", "    y_train_pred = model.predict(X_train)\n", "    y_test_pred = model.predict(X_test)\n", "\n", "    # Evaluate Train and Test dataset\n", "    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)\n", "    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)\n", "\n", "    print(name)\n", "    model_list.append(name)\n", "\n", "    print('Model performance for Training set')\n", "    print(\"- Root Mean Squared Error: {:.4f}\".format(model_train_rmse))\n", "    print(\"- Mean Absolute Error: {:.4f}\".format(model_train_mae))\n", "    print(\"- R2 Score: {:.4f}\".format(model_train_r2))\n", "    print('----------------------------------')\n", "\n", "    print('Model performance for Test set')\n", "    print(\"- Root Mean Squared Error: {:.4f}\".format(model_test_rmse))\n", "    print(\"- Mean Absolute Error: {:.4f}\".format(model_test_mae))\n", "    print(\"- R2 Score: {:.4f}\".format(model_test_r2))\n", "    r2_list.append(model_test_r2)  # only the test score is used for the ranking\n", "\n", "    print('='*35)\n", "    print('\\n')" ] }, { "cell_type": "code", "execution_count": 13, "id": "4deaafe9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Model NameR2_Score
2Ridge0.880593
0Linear Regression0.880345
5Random Forest Regressor0.856131
7CatBoosting Regressor0.851632
8AdaBoost Regressor0.848181
1Lasso0.825320
6XGBRegressor0.821589
3K-Neighbors Regressor0.783813
4Decision Tree0.743094
\n", "
" ], "text/plain": [ " Model Name R2_Score\n", "2 Ridge 0.880593\n", "0 Linear Regression 0.880345\n", "5 Random Forest Regressor 0.856131\n", "7 CatBoosting Regressor 0.851632\n", "8 AdaBoost Regressor 0.848181\n", "1 Lasso 0.825320\n", "6 XGBRegressor 0.821589\n", "3 K-Neighbors Regressor 0.783813\n", "4 Decision Tree 0.743094" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rank the models by their test-set R2, best first\n", "pd.DataFrame({'Model Name': model_list, 'R2_Score': r2_list}).sort_values(by=\"R2_Score\", ascending=False)" ] }, { "cell_type": "markdown", "id": "2a181133", "metadata": {}, "source": [ "Linear Regression" ] }, { "cell_type": "code", "execution_count": 14, "id": "68b6de4b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Accuracy of the model is 88.03\n" ] } ], "source": [ "lin_model = LinearRegression(fit_intercept=True)\n", "lin_model = lin_model.fit(X_train, y_train)\n", "y_pred = lin_model.predict(X_test)\n", "score = r2_score(y_test, y_pred)*100\n", "# r2_score is the coefficient of determination, not a classification accuracy,\n", "# so report it as R2 (expressed here as a percentage)\n", "print(\" R2 score of the model is %.2f%%\" %score)" ] }, { "cell_type": "code", "execution_count": 15, "id": "8e89547a", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Plot y_pred and y_test\n", "\n", "fig, ax = plt.subplots()\n", "ax.scatter(y_test, y_pred)\n", "ax.set(xlabel='Actual', ylabel='Predicted', title='Linear Regression: actual vs. predicted math score');" ] }, { "cell_type": "code", "execution_count": 16, "id": "0ca263b6", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Same comparison with a fitted regression line; label both axes explicitly\n", "# so the figure can be read on its own\n", "ax = sns.regplot(x=y_test, y=y_pred, ci=None, color='red')\n", "ax.set(xlabel='Actual', ylabel='Predicted');" ] }, { "cell_type": "code", "execution_count": 17, "id": "0af56bc7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Actual ValuePredicted ValueDifference
5219176.50781214.492188
7375358.953125-5.953125
7408076.9609383.039062
6607476.757812-2.757812
4118487.539062-3.539062
............
4085243.5468758.453125
3326262.031250-0.031250
2087467.9765626.023438
6136567.132812-2.132812
786162.492188-1.492188
\n", "

200 rows × 3 columns

\n", "
" ], "text/plain": [ " Actual Value Predicted Value Difference\n", "521 91 76.507812 14.492188\n", "737 53 58.953125 -5.953125\n", "740 80 76.960938 3.039062\n", "660 74 76.757812 -2.757812\n", "411 84 87.539062 -3.539062\n", ".. ... ... ...\n", "408 52 43.546875 8.453125\n", "332 62 62.031250 -0.031250\n", "208 74 67.976562 6.023438\n", "613 65 67.132812 -2.132812\n", "78 61 62.492188 -1.492188\n", "\n", "[200 rows x 3 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Actual vs. predicted values on the test split, plus their difference\n", "# (actual minus predicted)\n", "pred_df = pd.DataFrame(\n", "    {'Actual Value': y_test, 'Predicted Value': y_pred}\n", ").assign(Difference=lambda frame: frame['Actual Value'] - frame['Predicted Value'])\n", "pred_df" ] }, { "cell_type": "code", "execution_count": null, "id": "600f220b", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "2d5caac1", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 5 }