{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "from tqdm import tqdm_notebook\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics import mean_absolute_error\n", "from scipy.sparse import csr_matrix, hstack\n", "from sklearn.linear_model import RidgeCV, Ridge\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn import preprocessing\n", "import scipy\n", "import seaborn as sns\n", "from matplotlib import pyplot as plt\n", "from sklearn.model_selection import train_test_split \n", "from sklearn.tree import DecisionTreeRegressor\n", "import math\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Individual project of Ekaterina Chebeneva" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Introduction\n", "### 1 Feature and data explanation\n", "\n", "https://www.kaggle.com/spscientist/students-performance-in-exams/home\n", "\n", "This data set consists of the marks secured by the students in various subjects.\n", "\n", "Aim of the project is to understand the influence of various factors on the students performance and to predict the average score in three subjects" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Data description\n", "\n", "- gender: male and female\n", "- race/ethnicity\n", "- parental level of education\n", "- lunch: lunch level, Standard or Free/Reduced\n", "- test preparation course: presumably, \"none\" means that the student did not attend preparatory courses, and \"Completed\" means that they completed it.\n", "- math score\n", "- reading score\n", "- writing score\n", "- **avg_score: target feature, which is the average score for three subject**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('Downloads/StudentsPerformance.csv')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['avg_score'] = (df['math score'] + df['reading score'] + df['writing score']) / 3\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.unique(df['test preparation course'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.unique(df['parental level of education'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As seen above, there are no null values in this datafram" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(20, 10))\n", "sns.heatmap(df.corr())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(20, 10))\n", "sns.boxplot(x='gender', y='avg_score', data=df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(20, 10))\n", "sns.boxplot(y='avg_score', x='test preparation course', hue='gender', data=df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(20, 10))\n", "sns.countplot(x='race/ethnicity', data=df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(20, 10))\n", "sns.countplot(x='parental level of education', data=df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.pairplot(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.unique(df['parental level of education'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.barplot(x='avg_score', hue='parental level of education', data=df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Function that prints summary statistics of column given in parameters\n", "def summary_statistics(col, df):\n", " print('Mean: {}'.format(df[col].mean()))\n", " print('Max: {}'.format(df[col].max()))\n", " print('Min: {}'.format(df[col].min()))\n", " print('Median: {}'.format(df[col].median()))\n", " print()\n", " \n", " # Number of students with maximum and minimum score\n", " max_score = df[col].max()\n", " min_score = df[col].min()\n", " print(\"Number of students who scored maximum score: {}\".format(df[col][df[col]==max_score].count()))\n", " print(\"Number of students who scored minimum score: {}\".format(df[col][df[col]==min_score].count()))\n", " print()\n", " \n", " # Students close to mean i.e. Students that have scores equal to floor(mean score) or ceiling(mean score)\n", " near_mean_floor = math.floor(df[col].mean())\n", " near_mean_ceil = math.ceil(df[col].mean())\n", " near_mean_tot = df[col][df[col]==near_mean_floor].count() + df[col][df[col]==near_mean_ceil].count()\n", " print(\"Number of students close to mean score: {}\".format(near_mean_tot))\n", " print()\n", " \n", " # Students that have 50th percentile\n", " print(\"Number of students at median score: {}\".format(df[col][df[col]==df[col].median()].count()))\n", " \n", " # Students with 25th percentile and 75th percentile scores\n", " print(\"Number of students at 25th percentile: {}\".format(df[col][df[col]==df[col].quantile(0.25)].count()))\n", " print(\"Number of students at 75th percentile: {}\".format(df[col][df[col]==df[col].quantile(0.75)].count()))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "summary_statistics(\"math score\", df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "summary_statistics(\"reading score\", df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "summary_statistics(\"writing score\", df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Does completing course really affects the score?**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Students that have more than median marks in \n", "\n", "# Maths\n", "print(\"Maths\")\n", "df_top_math = df[df[\"math score\"] > df[\"math score\"].median()]\n", "print(df_top_math[\"test preparation course\"].value_counts())\n", "print()\n", "\n", "# Reading\n", "print(\"Reading\")\n", "df_top_read = df[df[\"reading score\"] > df[\"reading score\"].median()]\n", "print(df_top_read[\"test preparation course\"].value_counts())\n", "print()\n", "\n", "# Writing\n", "print(\"Writing\")\n", "df_top_writ = df[df[\"writing score\"] > df[\"writing score\"].median()]\n", "print(df_top_writ[\"test preparation course\"].value_counts())\n", "print()\n", "\n", "print(\"Average score\")\n", "df_top_writ = df[df[\"avg_score\"] > df[\"avg_score\"].median()]\n", "print(df_top_writ[\"test preparation course\"].value_counts())\n", "print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can see that there is almost a ratio of 1:1 for students who have completed the course and students who have not completed the course (for more than median score). So, we can set a hypothesis like Possibility of scoring more than median marks in test remains unchanged after completing.\n", "But this cannot be stated as a final conclusion." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Students that have less than or equal to median marks in \n", "\n", "# Maths\n", "print(\"Maths\")\n", "df_bot_math = df[df[\"math score\"] <= df[\"math score\"].median()]\n", "print(df_bot_math[\"test preparation course\"].value_counts())\n", "print()\n", "\n", "# Reading\n", "print(\"Reading\")\n", "df_bot_read = df[df[\"reading score\"] <= df[\"reading score\"].median()]\n", "print(df_bot_read[\"test preparation course\"].value_counts())\n", "print()\n", "\n", "# Writing\n", "print(\"Writing\")\n", "df_bot_writ = df[df[\"writing score\"] <= df[\"writing score\"].median()]\n", "print(df_bot_writ[\"test preparation course\"].value_counts())\n", "print()\n", "\n", "print(\"Average score\")\n", "df_top_writ = df[df[\"avg_score\"] <= df[\"avg_score\"].median()]\n", "print(df_top_writ[\"test preparation course\"].value_counts())\n", "print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For the students that have less than or equal to median marks the hypothesis assumed above has been proven wrong. As, there is a ratio of 2:1 for the students that have not completed the course to the students who have completed the course. So our new hypothesis could be that\n", "You are twice as likely to score less than median marks if you don't complete the test preperation course\n", "But this too, cannot be stated as final conclusion." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def graphs(score_type, suptitle, groupbyterm, kind):\n", " nrows = 2\n", " ncols = 3\n", " inches = 5\n", " df_female = df[df['gender'] == 'female']\n", " df_male = df[df['gender'] == 'male']\n", " fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols*inches,nrows*inches))\n", " fig.suptitle(suptitle)\n", " temp = df[groupbyterm].value_counts().rename(\"\")\n", " temp.plot.pie(ax=axes[0,0], title=\"Students Overall\", autopct=\"%.2f\", legend=False)\n", " temp = df_female[groupbyterm].value_counts().rename(\"\")\n", " temp.plot.pie(ax=axes[0,1], title=\"Female\", autopct=\"%.2f\", legend=False)\n", " temp = df_male[groupbyterm].value_counts().rename(\"\")\n", " temp.plot.pie(ax=axes[0,2], title=\"Male\", autopct=\"%.2f\", legend=False)\n", " pd.concat([\n", " df_female.groupby(groupbyterm)[score_type].mean().rename(\"Female\"),\n", " df_male.groupby(groupbyterm)[score_type].mean().rename(\"Male\")], axis=1).plot(kind=\"bar\", ax=axes[1,0], legend=True)\n", " axes[1,0].set_xlabel(\"\")\n", " axes[1,0].legend([x.get_text().capitalize() for x in axes[1,0].legend().get_texts()])\n", " axes[1,0].set_xticklabels([x.get_text().capitalize() for x in axes[1,0].get_xticklabels()])\n", " df_female.groupby(groupbyterm)[score_type].plot(kind=kind, ax=axes[1,1], legend=True, alpha=0.8, histtype=\"step\")\n", " axes[1,1].set_xlabel(\"Female Scores\")\n", " axes[1,1].set_ylabel(\"\")\n", " axes[1,1].set_xticks(np.arange(0, 101, step=20))\n", " axes[1,1].set_yticks(np.arange(0, 101, step=20))\n", " axes[1,1].legend([x.get_text().capitalize() for x in axes[1,1].legend().get_texts()])\n", " df_male.groupby(groupbyterm)[score_type].plot(kind=kind, ax=axes[1,2], legend=True, alpha=0.8, histtype=\"step\")\n", " axes[1,2].set_xlabel(\"Male Scores\")\n", " axes[1,2].set_ylabel(\"\")\n", " axes[1,2].set_xticks(np.arange(0, 101, step=20))\n", " axes[1,2].set_yticks(np.arange(0, 101, step=20))\n", " axes[1,2].legend([x.get_text().capitalize() for x in axes[1,2].legend().get_texts()])\n", " return fig, axes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axes = graphs(\"avg_score\",\"Avg Scores by Parental Level of Education\", \"parental level of education\", \"hist\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Very few parents hold a Bachelor's degree; fewer still, hold Master's degree. Because we have fewer samples of parents holding math degrees, it stands to show that the scores for these students will be higher on average, as opposed to the other degrees. But looking at the more populated degrees, their scores also show variations, albeit not a lot, but variations nevertheless." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axes = graphs(\"avg_score\",\"Avg Scores by Race/Ethnicity\", \"race/ethnicity\", \"hist\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Shortest population in Race/Ethnic group A. And in terms of performance, they are still at the lowest on average. Most dense population in Race/Ethnic group C. In terms of performance, they're average." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axes = graphs(\"avg_score\",\"Avg Scores by Lunch\", \"lunch\", \"hist\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Following the averages, boys still outperform the girls at math regardless of lunch. Almost two-thirds of the students had a standard lunch. It's good to know that these students didn't sacrifice their stomachs for scores.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axes = graphs(\"avg_score\",\"Avg Scores by Test Preparation Course\", \"test preparation course\", \"hist\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Going off of the numbers in the graphs, Only a third of the students are taking these courses seriously. And the ones that did take the courses aren't having a significant advantage at scores. So should the Preparatory courses be cancelled? Looking at the smaller bounds, Students that took the preparatory course have higher baselines than the ones that didn't. So it's probably good to keep them going." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Distribution of student scores**\n", "\n", "All the scores are approximately normally distributed. Q-Q plots show skewness in both directions, indicating deviation from normal distribution in those regions.\n", "Joint distributions show strong correlation between test scores among different subjects which is not surprising." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "score_cols = ['math score', 'reading score','writing score', 'avg_score']\n", "from scipy.stats import norm\n", "\n", "\n", "def Plot_Dist(df, col):\n", " fig,axarr = plt.subplots(1,2,figsize=(12,4))\n", " # plot distribution\n", " sns.distplot(df[col], fit=norm, kde=False, ax=axarr[0])\n", " #Q-Q plot\n", " from statsmodels.graphics.gofplots import qqplot\n", " qqplot(df['math score'],line='s', ax=axarr[1])\n", " fig.suptitle(col+' distribution', fontsize=14)\n", " plt.show()\n", "\n", "Plot_Dist(df,col='math score')\n", "Plot_Dist(df,col='reading score')\n", "Plot_Dist(df,col='writing score')\n", "Plot_Dist(df,col='avg_score')\n", "\n", "\n", "ax1=sns.jointplot(x=\"math score\", y=\"reading score\", data=df)\n", "plt.show()\n", "\n", "ax2=sns.jointplot(x=\"math score\", y=\"writing score\", data=df)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Variation of Scores with Gender of the student**\n", "\n", "Female students beat male counter parts in reading and writing. In math, boys on an average do better than girls." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def Plot_Set(df, xcol, ycols):\n", " df = df.sort_values(by=xcol)\n", " fig,axarr = plt.subplots(1, 4,figsize=(12,5))\n", " for id,ycol in enumerate(ycols):\n", " medians = df.groupby([xcol])[ycol].median().values\n", " median_labels = [str(np.round(s, 2)) for s in medians]\n", " pos = range(len(medians))\n", " sns.boxplot(x=xcol, y=ycol, data=df, width=0.5, palette='Set3', ax=axarr[id], linewidth=0.5)\n", " for tick,label in zip(pos,axarr[id].get_xticklabels()):\n", " axarr[id].text(pos[tick], medians[tick] + 0.5, median_labels[tick], horizontalalignment='center', size='medium', color='k', weight='semibold')\n", " axarr[id].set_ylim([0,105])\n", " plt.setp(axarr[id].get_xticklabels(), rotation=25,ha='right')\n", " plt.tight_layout()\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Plot_Set(df, xcol='gender', ycols=['math score','reading score','writing score', 'avg_score'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Variation of Scores with race/ethnicity**\n", "\n", "Race has a significant influence on test scores. For all subjects, students in group E perform better than students from other ethnicity." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Plot_Set(df,xcol='race/ethnicity',ycols=['math score','reading score','writing score', 'avg_score'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Variation of Scores with parental level of education**\n", "\n", "Education level of parents has a direct impact on the test scores. Higher the education level of the parent, higher the student scores." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Plot_Set(df,xcol='parental level of education',ycols=['math score','reading score','writing score', 'avg_score'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Variation of Scores with lunch type**\n", "\n", "Whether the student gets the standard lunch or free/reduced lunch has an impact on the scores. It is evident students from lower income families on an average have 5-10 points lower scores than those who can afford standard lunches." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Plot_Set(df,xcol='lunch',ycols=['math score','reading score','writing score', 'avg_score'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Prediction \n", "\n", "MAE and RMSE are the two most popular metrics for continuous variables. Let’s start with the more popular one.\n", "\n", "It is easy to interpret MAE because it directly takes the average of offsets whereas RMSE penalizes the higher difference more than MAE. Therefore, I chose mse as the metric\n", "\n", "\n", "Decision tree selected as model for prediction.\n", "\n", "Tree classification techniques, when they \"work\" and produce accurate predictions or predicted classifications based on few logical if-then conditions, have a number of advantages over many of those alternative techniques. Simplicity of results. In most cases, the interpretation of results summarized in a tree is very simple. This simplicity is useful not only for purposes of rapid classification of new observations (it is much easier to evaluate just one or two logical conditions, than to compute classification scores for each possible group, or predicted values, based on all predictors and using possibly some complex nonlinear model equations), but can also often yield a much simpler \"model\" for explaining why observations are classified or predicted in a particular manner (e.g., when analyzing business problems, it is much easier to present a few simple if-then statements to management, than some elaborate equations).\n", "\n", "For data preprocessing is used one hot encoding for categorical features." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.concat([df, pd.get_dummies(df.gender, prefix='gender_')], axis=1)\n", "df = pd.concat([df, pd.get_dummies(df['race/ethnicity'], prefix='race_')], axis=1)\n", "df = pd.concat([df, pd.get_dummies(df['parental level of education'], prefix='edu_')], axis=1)\n", "df = pd.concat([df, pd.get_dummies(df['lunch'], prefix='lunch_')], axis=1)\n", "df = pd.concat([df, pd.get_dummies(df['test preparation course'], prefix='course_')], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.drop(columns=['gender', 'race/ethnicity','parental level of education', 'lunch', 'test preparation course'], inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "target = df.avg_score\n", "df.drop(columns=['math score', 'reading score', 'writing score', 'avg_score'], inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.33, random_state=17)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dtr = DecisionTreeRegressor(max_depth=4, random_state=17)\n", "\n", "cross_val_score(dtr, df, target, cv=10, scoring='neg_mean_squared_error')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dtr = DecisionTreeRegressor(max_depth=4, random_state=17)\n", "\n", "dtr.fit(X_train, y_train)\n", "\n", "y_pred = dtr.predict(X_test)\n", "mean_absolute_error(y_test, y_pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }