{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "25a4784b-3d3b-4870-bd32-83d32ec7d4de", "metadata": { "tags": [] }, "source": [ "# Uncovering Trends and Patterns in Raw World Happiness Data for Strategic Insights" ] }, { "attachments": {}, "cell_type": "markdown", "id": "693db1a6-6243-447a-90f3-3ddcbc180b0f", "metadata": { "tags": [] }, "source": [ "# Table of Contents" ] }, { "attachments": {}, "cell_type": "markdown", "id": "411b2289-d816-4c08-9fc9-08233887df4f", "metadata": {}, "source": [ "1. [Import Relevant Packages](#import)\n", "2. [Setup Notebook Configuration](#setup)\n", "3. [Load the Data Frames](#load)\n", "4. [Perform EDA (Exploratory Data Analysis)](#eda)\n", "5. [Cluster Our Data Frame](#cluster)\n", "6. [Feature (Column) Understanding](#feature)\n", "7. [Model Training and Evaluation](#model)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "e844e502-37e3-4cdc-9705-b927d798940c", "metadata": { "tags": [] }, "source": [ "\n", "## Import Relevant Packages:" ] }, { "cell_type": "code", "execution_count": 1, "id": "119db542-d7c3-47f8-83a1-be91de798c39", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The geodata is provided by © OpenStreetMap contributors and is made available here under the Open Database License (ODbL).\n" ] } ], "source": [ "# Core Python libraries:\n", "import pandas as pd\n", "import numpy as np\n", "from typing import Union\n", "import re\n", "import os\n", "import certifi\n", "\n", "# Visiualization libraries:\n", "import mercury as mr\n", "import pygwalker as pyg\n", "from matplotlib import pyplot as plt\n", "from matplotlib_inline.backend_inline import set_matplotlib_formats\n", "import scienceplots\n", "import seaborn as sns\n", "from lets_plot import *\n", "from lets_plot.bistro import *\n", "from lets_plot.geo_data import *\n", "\n", "# Machine Learning and Numerical Processing libraries:\n", "from tqdm import tqdm\n", "from xgboost import XGBRegressor\n", "from sklearn.cluster import KMeans\n", "from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler\n", "from sklearn.metrics import mean_squared_error, r2_score\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor, RandomForestRegressor\n", "from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV" ] }, { "attachments": {}, "cell_type": "markdown", "id": "0aa27a00-5427-4f24-96b5-49d03cd045d7", "metadata": {}, "source": [ "\n", "## Setup Notebook Configuration:" ] }, { "cell_type": "code", "execution_count": 2, "id": "8c804105-d309-4216-a9eb-0b180c2a459b", "metadata": {}, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"App\",\n \"title\": \"Uncovering Trends and Patterns in Raw World Happiness Data for Strategic Insights!\",\n \"description\": \"This notebook aims \\nto analyze and understand the factors influencing the happiness levels of countries worldwide, as reported in the World Happiness Report. \\nThrough a detailed examination of indicators and behaviors associated with happiness, we seek to unveil trends and patterns. \\nThe insights and visualizations derived from this data exploration and mining are designed to guide strategic decision-making for \\ngovernmental and non-governmental leaders interested in enhancing societal well-being. By leveraging this information, leadership \\ncan enact policies and initiatives that directly target areas with potential for improvement.\",\n \"show_code\": false,\n \"show_prompt\": false,\n \"output\": \"app\",\n \"schedule\": \"\",\n \"notify\": \"{}\",\n \"continuous_update\": true,\n \"static_notebook\": false,\n \"show_sidebar\": true,\n \"full_screen\": true,\n \"allow_download\": true,\n \"model_id\": \"mercury-app\",\n \"code_uid\": \"App.0.40.24.2-rand6bae532a\"\n}", "text/html": [ "

Mercury Application

This output won't appear in the web app." ], "text/plain": [ "mercury.App" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Run \"$mercury run\" in terminal to view this notebook as an interactive web application!\n", "app = mr.App(title=\"Uncovering Trends and Patterns in Raw World Happiness Data for Strategic Insights!\", description=\"\"\"This notebook aims \n", "to analyze and understand the factors influencing the happiness levels of countries worldwide, as reported in the World Happiness Report. \n", "Through a detailed examination of indicators and behaviors associated with happiness, we seek to unveil trends and patterns. \n", "The insights and visualizations derived from this data exploration and mining are designed to guide strategic decision-making for \n", "governmental and non-governmental leaders interested in enhancing societal well-being. By leveraging this information, leadership \n", "can enact policies and initiatives that directly target areas with potential for improvement.\"\"\", show_code=False)\n", "\n", "pd.set_option('display.max_columns', None) # Display all the columns of a dataframe\n", "pd.set_option('expand_frame_repr', False) # Display the dataframe's records on the same line\n", "np.set_printoptions(linewidth=np.inf, threshold=np.inf) # Display the numerically processed dataframe's records on the same line\n", "os.environ['SSL_CERT_FILE'] = certifi.where()\n", "\n", "separator = \"\\n\" * 2 + \"#\" * 150 + \"\\n\"\n", "\n", "LetsPlot.setup_html()\n", "plt.style.use(['ieee', 'science', 'notebook'])\n", "plt.rcParams[\"figure.autolayout\"] = True\n", "set_matplotlib_formats('svg')" ] }, { "attachments": {}, "cell_type": "markdown", "id": "4fc31dff-b55a-492e-86d1-eafeb7ef1651", "metadata": {}, "source": [ "\n", "## Load the Data Frames:" ] }, { "cell_type": "code", "execution_count": 3, "id": "fa44ddd0-73bc-40b5-a668-eb80c4dddb13", "metadata": {}, "outputs": [], "source": [ "dfs = {\"current_df\": pd.read_csv(\"world_happiness_data/world-happiness-report-2021.csv\"),\n", " \"historic_df\": pd.read_csv(\"world_happiness_data/world-happiness-report.csv\")}" ] }, { "attachments": {}, "cell_type": "markdown", "id": "27ec4973-30f7-4083-ba9d-bf669cbf9c0d", "metadata": {}, "source": [ "\n", "## Perform EDA (Exploratory Data Analysis)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "f434b416-26eb-4ac9-acf2-bdce6896bd84", "metadata": {}, "source": [ "### Peek into the data:" ] }, { "cell_type": "code", "execution_count": 4, "id": "91ecb806-03ce-4ba0-b6ec-33b1890750e5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "*** current_df ***\n", "\n", " Country name Regional indicator Ladder score Standard error of ladder score upperwhisker lowerwhisker Logged GDP per capita Social support Healthy life expectancy Freedom to make life choices Generosity Perceptions of corruption Ladder score in Dystopia Explained by: Log GDP per capita Explained by: Social support Explained by: Healthy life expectancy Explained by: Freedom to make life choices Explained by: Generosity Explained by: Perceptions of corruption Dystopia + residual\n", "0 Finland Western Europe 7.842 0.032 7.904 7.780 10.775 0.954 72.0 0.949 -0.098 0.186 2.43 1.446 1.106 0.741 0.691 0.124 0.481 3.253\n", "1 Denmark Western Europe 7.620 0.035 7.687 7.552 10.933 0.954 72.7 0.946 0.030 0.179 2.43 1.502 1.108 0.763 0.686 0.208 0.485 2.868\n", "2 Switzerland Western Europe 7.571 0.036 7.643 7.500 11.117 0.942 74.4 0.919 0.025 0.292 2.43 1.566 1.079 0.816 0.653 0.204 0.413 2.839\n", "3 Iceland Western Europe 7.554 0.059 7.670 7.438 10.878 0.983 73.0 0.955 0.160 0.673 2.43 1.482 1.172 0.772 0.698 0.293 0.170 2.967\n", "4 Netherlands Western Europe 7.464 0.027 7.518 7.410 10.932 0.942 72.4 0.913 0.175 0.338 2.43 1.501 1.079 0.753 0.647 0.302 0.384 2.798 \n", "\n", "######################################################################################################################################################\n", "\n", " Country name Regional indicator Ladder score Standard error of ladder score upperwhisker lowerwhisker Logged GDP per capita Social support Healthy life expectancy Freedom to make life choices Generosity Perceptions of corruption Ladder score in Dystopia Explained by: Log GDP per capita Explained by: Social support Explained by: Healthy life expectancy Explained by: Freedom to make life choices Explained by: Generosity Explained by: Perceptions of corruption Dystopia + residual\n", "144 Lesotho Sub-Saharan Africa 3.512 0.120 3.748 3.276 7.926 0.787 48.700 0.715 -0.131 0.915 2.43 0.451 0.731 0.007 0.405 0.103 0.015 1.800\n", "145 Botswana Sub-Saharan Africa 3.467 0.074 3.611 3.322 9.782 0.784 59.269 0.824 -0.246 0.801 2.43 1.099 0.724 0.340 0.539 0.027 0.088 0.648\n", "146 Rwanda Sub-Saharan Africa 3.415 0.068 3.548 3.282 7.676 0.552 61.400 0.897 0.061 0.167 2.43 0.364 0.202 0.407 0.627 0.227 0.493 1.095\n", "147 Zimbabwe Sub-Saharan Africa 3.145 0.058 3.259 3.030 7.943 0.750 56.201 0.677 -0.047 0.821 2.43 0.457 0.649 0.243 0.359 0.157 0.075 1.205\n", "148 Afghanistan South Asia 2.523 0.038 2.596 2.449 7.695 0.463 52.493 0.382 -0.102 0.924 2.43 0.370 0.000 0.126 0.000 0.122 0.010 1.895 \n", "\n", "######################################################################################################################################################\n", "\n", "\n", "RangeIndex: 149 entries, 0 to 148\n", "Data columns (total 20 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Country name 149 non-null object \n", " 1 Regional indicator 149 non-null object \n", " 2 Ladder score 149 non-null float64\n", " 3 Standard error of ladder score 149 non-null float64\n", " 4 upperwhisker 149 non-null float64\n", " 5 lowerwhisker 149 non-null float64\n", " 6 Logged GDP per capita 149 non-null float64\n", " 7 Social support 149 non-null float64\n", " 8 Healthy life expectancy 149 non-null float64\n", " 9 Freedom to make life choices 149 non-null float64\n", " 10 Generosity 149 non-null float64\n", " 11 Perceptions of corruption 149 non-null float64\n", " 12 Ladder score in Dystopia 149 non-null float64\n", " 13 Explained by: Log GDP per capita 149 non-null float64\n", " 14 Explained by: Social support 149 non-null float64\n", " 15 Explained by: Healthy life expectancy 149 non-null float64\n", " 16 Explained by: Freedom to make life choices 149 non-null float64\n", " 17 Explained by: Generosity 149 non-null float64\n", " 18 Explained by: Perceptions of corruption 149 non-null float64\n", " 19 Dystopia + residual 149 non-null float64\n", "dtypes: float64(18), object(2)\n", "memory usage: 23.4+ KB\n", "None \n", "\n", "######################################################################################################################################################\n", "\n", "*** historic_df ***\n", "\n", " Country name year Life Ladder Log GDP per capita Social support Healthy life expectancy at birth Freedom to make life choices Generosity Perceptions of corruption Positive affect Negative affect\n", "0 Afghanistan 2008 3.724 7.370 0.451 50.80 0.718 0.168 0.882 0.518 0.258\n", "1 Afghanistan 2009 4.402 7.540 0.552 51.20 0.679 0.190 0.850 0.584 0.237\n", "2 Afghanistan 2010 4.758 7.647 0.539 51.60 0.600 0.121 0.707 0.618 0.275\n", "3 Afghanistan 2011 3.832 7.620 0.521 51.92 0.496 0.162 0.731 0.611 0.267\n", "4 Afghanistan 2012 3.783 7.705 0.521 52.24 0.531 0.236 0.776 0.710 0.268 \n", "\n", "######################################################################################################################################################\n", "\n", " Country name year Life Ladder Log GDP per capita Social support Healthy life expectancy at birth Freedom to make life choices Generosity Perceptions of corruption Positive affect Negative affect\n", "1944 Zimbabwe 2016 3.735 7.984 0.768 54.4 0.733 -0.095 0.724 0.738 0.209\n", "1945 Zimbabwe 2017 3.638 8.016 0.754 55.0 0.753 -0.098 0.751 0.806 0.224\n", "1946 Zimbabwe 2018 3.616 8.049 0.775 55.6 0.763 -0.068 0.844 0.710 0.212\n", "1947 Zimbabwe 2019 2.694 7.950 0.759 56.2 0.632 -0.064 0.831 0.716 0.235\n", "1948 Zimbabwe 2020 3.160 7.829 0.717 56.8 0.643 -0.009 0.789 0.703 0.346 \n", "\n", "######################################################################################################################################################\n", "\n", "\n", "RangeIndex: 1949 entries, 0 to 1948\n", "Data columns (total 11 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Country name 1949 non-null object \n", " 1 year 1949 non-null int64 \n", " 2 Life Ladder 1949 non-null float64\n", " 3 Log GDP per capita 1913 non-null float64\n", " 4 Social support 1936 non-null float64\n", " 5 Healthy life expectancy at birth 1894 non-null float64\n", " 6 Freedom to make life choices 1917 non-null float64\n", " 7 Generosity 1860 non-null float64\n", " 8 Perceptions of corruption 1839 non-null float64\n", " 9 Positive affect 1927 non-null float64\n", " 10 Negative affect 1933 non-null float64\n", "dtypes: float64(9), int64(1), object(1)\n", "memory usage: 167.6+ KB\n", "None \n", "\n", "######################################################################################################################################################\n", "\n" ] } ], "source": [ "for df in dfs:\n", " print(f\"*** {df} ***\\n\")\n", " print(dfs[df].head(), separator)\n", " print(dfs[df].tail(), separator)\n", " print(dfs[df].info(), separator)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "77787eb2-0f4d-4104-b41c-48ff52841bb1", "metadata": {}, "source": [ "### Clean column names as necessary:" ] }, { "cell_type": "code", "execution_count": 5, "id": "a54c1151-cc85-4d58-9aaa-de50241d4ab2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "*** current_df ***\n", "\n", "Old column names: ['Country name' 'Regional indicator' 'Ladder score' 'Standard error of ladder score' 'upperwhisker' 'lowerwhisker' 'Logged GDP per capita' 'Social support' 'Healthy life expectancy' 'Freedom to make life choices' 'Generosity' 'Perceptions of corruption' 'Ladder score in Dystopia' 'Explained by: Log GDP per capita' 'Explained by: Social support' 'Explained by: Healthy life expectancy' 'Explained by: Freedom to make life choices' 'Explained by: Generosity' 'Explained by: Perceptions of corruption' 'Dystopia + residual']\n", "New column names: ['Country_Name' 'Regional_Indicator' 'Ladder_Score' 'Standard_Error_Of_Ladder_Score' 'Upperwhisker' 'Lowerwhisker' 'Logged_Gdp_Per_Capita' 'Social_Support' 'Healthy_Life_Expectancy' 'Freedom_To_Make_Life_Choices' 'Generosity' 'Perceptions_Of_Corruption' 'Ladder_Score_In_Dystopia' 'Explained_By:_Log_Gdp_Per_Capita' 'Explained_By:_Social_Support' 'Explained_By:_Healthy_Life_Expectancy' 'Explained_By:_Freedom_To_Make_Life_Choices' 'Explained_By:_Generosity' 'Explained_By:_Perceptions_Of_Corruption' 'Dystopia_+_Residual'] \n", "\n", "######################################################################################################################################################\n", "\n", "*** historic_df ***\n", "\n", "Old column names: ['Country name' 'year' 'Life Ladder' 'Log GDP per capita' 'Social support' 'Healthy life expectancy at birth' 'Freedom to make life choices' 'Generosity' 'Perceptions of corruption' 'Positive affect' 'Negative affect']\n", "New column names: ['Country_Name' 'Year' 'Life_Ladder' 'Log_Gdp_Per_Capita' 'Social_Support' 'Healthy_Life_Expectancy_At_Birth' 'Freedom_To_Make_Life_Choices' 'Generosity' 'Perceptions_Of_Corruption' 'Positive_Affect' 'Negative_Affect'] \n", "\n", "######################################################################################################################################################\n", "\n" ] } ], "source": [ "def clean_columns(column, elements_to_remove: Union[list, tuple]=(\",\", \".\")):\n", " column = re.sub(r'\\([^)]*\\)', '', column) # Remove all parentheses and their content\n", " for e in elements_to_remove:\n", " column = column.replace(e, \"\") # Remove all commas, periods, etc.\n", " column = column.strip().replace(\" \", \"_\") # Remove any leading and trailing whitespaces\n", " return column.title()\n", "\n", "\n", "for df in dfs:\n", " print(f\"*** {df} ***\\n\")\n", " print(f\"Old column names: {dfs[df].columns.values}\")\n", " dfs[df] = dfs[df].rename(columns=lambda col: clean_columns(col, elements_to_remove=(\"\", \"?\")))\n", " print(f\"New column names: {dfs[df].columns.values}\", separator)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "267f5275-d47f-40bb-8603-639ce83dca7b", "metadata": {}, "source": [ "### Check for duplicated records (rows) in our data frames (if any):" ] }, { "cell_type": "code", "execution_count": 6, "id": "22e2e312-8ac1-4ee4-a814-e657eb1f6187", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "False\n", "False\n", "\n", "\n", "######################################################################################################################################################\n", "\n", "False\n" ] } ], "source": [ "# Check if we have any rows that are identical on every column in both data frames:\n", "for df in dfs:\n", " print(dfs[df].duplicated().any())\n", "\n", "print(separator)\n", "\n", "# Check if we have any rows that are identical on the key columns (\"Country_Name\" and \"Year\") in \"historic_df\":\n", "print(dfs[\"historic_df\"].duplicated(subset=(\"Country_Name\", \"Year\")).any())" ] }, { "attachments": {}, "cell_type": "markdown", "id": "97187473-c43e-414e-a18c-722f0930ccf5", "metadata": {}, "source": [ "### Enrich our dataset in a meaningful way by appropriately combining the two datasets into one:" ] }, { "cell_type": "code", "execution_count": 7, "id": "2c5d4fa1-fbaa-4a22-8df2-f689a4fb5150", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Country_Name Regional_Indicator Year Happiness_Index Logged_Gdp_Per_Capita Social_Support Healthy_Life_Expectancy Freedom_To_Make_Life_Choices Generosity Perceptions_Of_Corruption\n", "0 Afghanistan South Asia 2008 3.724 7.370 0.451 50.800 0.718 0.168 0.882\n", "1 Afghanistan South Asia 2009 4.402 7.540 0.552 51.200 0.679 0.190 0.850\n", "2 Afghanistan South Asia 2010 4.758 7.647 0.539 51.600 0.600 0.121 0.707\n", "3 Afghanistan South Asia 2011 3.832 7.620 0.521 51.920 0.496 0.162 0.731\n", "4 Afghanistan South Asia 2012 3.783 7.705 0.521 52.240 0.531 0.236 0.776\n", "... ... ... ... ... ... ... ... ... ... ...\n", "2030 Zimbabwe Sub-Saharan Africa 2017 3.638 8.016 0.754 55.000 0.753 -0.098 0.751\n", "2031 Zimbabwe Sub-Saharan Africa 2018 3.616 8.049 0.775 55.600 0.763 -0.068 0.844\n", "2032 Zimbabwe Sub-Saharan Africa 2019 2.694 7.950 0.759 56.200 0.632 -0.064 0.831\n", "2033 Zimbabwe Sub-Saharan Africa 2020 3.160 7.829 0.717 56.800 0.643 -0.009 0.789\n", "2034 Zimbabwe Sub-Saharan Africa 2021 3.145 7.943 0.750 56.201 0.677 -0.047 0.821\n", "\n", "[2035 rows x 10 columns]\n" ] } ], "source": [ "# Join the two data frames to include additional columns:\n", "dfs[\"historic_df\"] = dfs[\"current_df\"][[\"Country_Name\", \"Regional_Indicator\"]].merge(dfs[\"historic_df\"], on=\"Country_Name\", how=\"inner\")\n", "\n", "# Create and modify columns:\n", "dfs[\"current_df\"].insert(loc=2, column=\"Year\", value=2021)\n", "dfs[\"current_df\"] = dfs[\"current_df\"].rename(columns={\"Ladder_Score\": \"Happiness_Index\"})\n", "columns = dfs[\"current_df\"].columns\n", "key_columns = np.concatenate((columns[:4], columns[7: 13])) # Grab the desired columns\n", "dfs[\"current_df\"] = dfs[\"current_df\"][key_columns]\n", "\n", "dfs[\"historic_df\"] = dfs[\"historic_df\"].iloc[:, :-2] # Remove the undesired columns\n", "dfs[\"historic_df\"].columns = key_columns # Rename columns to match in both data frames\n", "\n", "# Combine the two data frames:\n", "happiness_df = (pd.concat([dfs[\"current_df\"], dfs[\"historic_df\"]], ignore_index=True)\n", " .sort_values([\"Country_Name\", \"Year\"])\n", " .reset_index(drop=True))\n", "print(happiness_df)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "b51b97d4-0d7b-4a65-a8d2-5da48b8a5116", "metadata": {}, "source": [ "### Fix data frame's column datatypes as necessary:" ] }, { "cell_type": "code", "execution_count": 8, "id": "2919f135-b415-4f4b-8af8-a6c43fd65fa4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Changed column \"Country_Name`s\" datatype to \"category\"\n", "Changed column \"Regional_Indicator`s\" datatype to \"category\"\n" ] } ], "source": [ "def is_int(x):\n", " try:\n", " if np.isnan(x) or int(x):\n", " return True\n", " except:\n", " return False\n", " \n", "\n", "def is_float(x):\n", " try:\n", " float(x)\n", " return True\n", " except:\n", " return False\n", " \n", "\n", "def fix_columns(df):\n", " for column in df.columns:\n", " if any(word in column.lower() for word in ('date', 'time')):\n", " df[column] = df[column].astype('datetime64[ns]')\n", " print(f'Changed column \"{column}\"`s datatype to \"datetime\"')\n", "\n", " elif df[column].dtype in (object, str):\n", " df[column] = df[column].str.strip()\n", "\n", " if np.prod(df[column].value_counts().values) == 1:\n", " print(f'Column \"{column}\" contains meta data!')\n", "\n", " elif all(df[column].apply(is_int)):\n", " df[column] = df[column].astype(pd.Int32Dtype()) # Cannot convert \"nan\" values into \"int\", need \"pd.Int\"\n", " print(f'Changed column \"{column}`s\" datatype to \"int\"')\n", " \n", " else:\n", " temp = df[column].str.replace(',', '.')\n", " if all(temp.apply(is_float)):\n", " df[column] = temp.astype(float)\n", " print(f'Changed column \"{column}`s\" datatype to \"float\"')\n", " else:\n", " df[column] = df[column].astype('category') # More memory and performance efficient\n", " print(f'Changed column \"{column}`s\" datatype to \"category\"')\n", "\n", "\n", "fix_columns(happiness_df)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "3e65f443-9bd1-40be-b0bc-a1a40aa350a6", "metadata": {}, "source": [ "### Take a quick glance at the modified data's statistics:" ] }, { "cell_type": "code", "execution_count": 9, "id": "e4059789-0da2-408f-8f52-b4a71d393365", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 2035 entries, 0 to 2034\n", "Data columns (total 10 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Country_Name 2035 non-null category\n", " 1 Regional_Indicator 2035 non-null category\n", " 2 Year 2035 non-null int64 \n", " 3 Happiness_Index 2035 non-null float64 \n", " 4 Logged_Gdp_Per_Capita 2011 non-null float64 \n", " 5 Social_Support 2026 non-null float64 \n", " 6 Healthy_Life_Expectancy 1984 non-null float64 \n", " 7 Freedom_To_Make_Life_Choices 2005 non-null float64 \n", " 8 Generosity 1959 non-null float64 \n", " 9 Perceptions_Of_Corruption 1931 non-null float64 \n", "dtypes: category(2), float64(7), int64(1)\n", "memory usage: 138.9 KB\n", "None \n", "\n", "######################################################################################################################################################\n", "\n", " Year Happiness_Index Logged_Gdp_Per_Capita Social_Support Healthy_Life_Expectancy Freedom_To_Make_Life_Choices Generosity Perceptions_Of_Corruption\n", "count 2035.000000 2035.000000 2011.000000 2026.000000 1984.000000 2005.000000 1959.000000 1931.000000\n", "mean 2013.826536 5.490948 9.391096 0.814959 63.695212 0.748269 -0.002346 0.746277\n", "std 4.514250 1.107523 1.141129 0.116125 7.376080 0.139289 0.162257 0.186760\n", "min 2005.000000 2.375000 6.635000 0.291000 32.300000 0.258000 -0.335000 0.035000\n", "25% 2010.000000 4.669000 8.484000 0.751000 59.180000 0.656000 -0.117000 0.690000\n", "50% 2014.000000 5.420000 9.487000 0.836000 65.400000 0.769000 -0.029000 0.801000\n", "75% 2018.000000 6.298000 10.370500 0.906750 68.800000 0.861000 0.089000 0.870000\n", "max 2021.000000 8.019000 11.648000 0.987000 77.100000 0.985000 0.698000 0.983000\n" ] } ], "source": [ "print(happiness_df.info(), separator)\n", "print(happiness_df.describe())" ] }, { "attachments": {}, "cell_type": "markdown", "id": "046cd661-6a71-4bf8-9df6-66e680c687a7", "metadata": {}, "source": [ "### Visualize the number of missing values in each column of our data frame:" ] }, { "cell_type": "code", "execution_count": 10, "id": "8304f6bf-e7f6-4555-b113-9f688c611957", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "missing_count = happiness_df.isna().sum()\n", "missing_count = missing_count.reset_index(name=\"Count\").rename(columns={\"index\": \"Column Name\"})\n", "missing_count = missing_count[missing_count[\"Count\"] > 0].sort_values(\"Count\", ascending=False)\n", "\n", "(ggplot(missing_count)\n", "+ geom_bar(aes(x=\"Column Name\", y=\"Count\", fill=\"Column Name\", color=\"Column Name\"), color=\"black\", alpha=0.9, stat=\"identity\")\n", "+ ggtitle(\"Number of missing values in each column\")\n", "+ ggsize(800, 600) \n", "+ theme(legend_title=element_text(size=15), legend_text=element_text(size=13), axis_text_x=element_text(angle=70), panel_grid_major_x='blank') \n", ")\n", "# Notice that the columns of all the missing values are of \"numeric\" type!" ] }, { "attachments": {}, "cell_type": "markdown", "id": "2a319393-03ac-4d27-bb64-781b099e4dc1", "metadata": {}, "source": [ "### Given that \"Perceptions_Of_Corruption\" has the highest number of missing values, visualize which countries most contributed to that:" ] }, { "cell_type": "code", "execution_count": 11, "id": "a519f1a0-9466-4596-a7de-addee2fbf529", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "missing_corruption = (happiness_df.groupby([\"Country_Name\", \"Regional_Indicator\"], observed=True)\n", " .agg({\n", " \"Perceptions_Of_Corruption\": lambda x: x.isna().sum(),\n", " })\n", " ).reset_index()\n", "\n", "missing_corruption = (missing_corruption[missing_corruption[\"Perceptions_Of_Corruption\"] > 0]\n", " .rename(columns={\"Perceptions_Of_Corruption\": \"Missing_Corruption\"})\n", " .sort_values(\"Missing_Corruption\", ascending=False)\n", " .reset_index()\n", " )\n", "\n", "(ggplot(missing_corruption)\n", " + geom_bar(aes(x=\"Country_Name\", y=\"Missing_Corruption\", fill=\"Regional_Indicator\"), color=\"black\", stat=\"identity\",\n", " tooltips=layer_tooltips().line(\"@Country_Name\").line(\"Number of missing values|= ^y\").format(\"^y\", \".1s\"))\n", " +ggtitle('Countries with the highest number of missing \"Corruption\" values grouped by region')\n", " +ggsize(900, 500)\n", " + theme(legend_title=element_text(size=15), legend_text=element_text(size=13), axis_text_x=element_text(angle=85)) \n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "62ceb7a7-b434-4d41-96a6-7a2effa8ca40", "metadata": {}, "source": [ "### Visualize which countries had the highest number of missing values grouped by region:" ] }, { "cell_type": "code", "execution_count": 12, "id": "6d391a78-f559-44c7-a984-c9e993e1dd69", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "missing_per_country = (happiness_df.groupby([\"Country_Name\", \"Regional_Indicator\"], observed=True)\n", " .apply(lambda x: x.isna().sum().sum())\n", " .reset_index(name=\"Count\"))\n", "missing_per_country = missing_per_country[missing_per_country['Count'] > 0].sort_values('Count', ascending=False)\n", "\n", "(ggplot(missing_per_country.head(10))\n", " + geom_bar(aes(x='Count', y='Country_Name', fill=\"Regional_Indicator\"), stat='identity', orientation='y', color='black', tooltips=layer_tooltips().line(\"@Country_Name\").line(\"Number of missing values|= ^x\").format(\"^x\", \".1s\"))\n", " + ggtitle(\"Countries with the highest number of missing values grouped by region\")\n", " + ggsize(900, 500)\n", " +theme(panel_grid_major_y='blank')\n", " )" ] }, { "attachments": {}, "cell_type": "markdown", "id": "5ef43d2c-4d7b-42e5-806d-995c581156fa", "metadata": {}, "source": [ "### Imputation: Fill in the missing values for each column using the appropriate techniques:" ] }, { "cell_type": "code", "execution_count": 13, "id": "9b91f9c2-4931-4e6b-8a62-f88048daecee", "metadata": {}, "outputs": [], "source": [ "# A function that drops missing values from a data frame in a smart way by accounting for \"information lost\"\n", "def smart_dropna(df: pd.DataFrame):\n", " missing = df.isna().any()\n", " if missing.any():\n", " n_cols = len(df.columns) # Represents the fragments/fractions of information in a sample/record\n", " for col in df.columns[missing]: # Get the columns that contain missing values\n", " info_lost_by_col_rmv = df[col].count()/n_cols\n", " info_lost_by_rows_rmv = df[df[col].isna()].notna().sum(axis=1).sum()/n_cols\n", " if info_lost_by_col_rmv < info_lost_by_rows_rmv:\n", " df = df.drop(col, axis=1) # Drop the column\n", " else:\n", " df = df[df[col].notna()].reset_index(drop=True) # Drop all the rows containing missing values across \"col\"\n", " return df" ] }, { "cell_type": "code", "execution_count": 14, "id": "5ac479ab-5143-4103-bff8-e730dee66d0f", "metadata": {}, "outputs": [], "source": [ "class LinearStochasticRegressor(LinearRegression):\n", " def __init__(self, target: str, train_df: pd.DataFrame, target_df: pd.DataFrame):\n", " super().__init__()\n", " self.target_df = target_df\n", " self.train_df = smart_dropna(train_df.copy())\n", " self.target = self.train_df.pop(target).values\n", "\n", " extra_cols = np.setdiff1d(self.target_df.columns, self.train_df.columns)\n", " self.target_df = self.target_df.drop(extra_cols, axis=1).values\n", " self.train_df = self.train_df.values\n", "\n", " def fit_data(self):\n", " self.fit(self.train_df, self.target)\n", "\n", " def get_r2_score(self):\n", " return self.score(self.train_df, self.target)\n", " \n", " def add_random_error(self):\n", " # Residual sum of squares (Residual Variance):\n", " rrs = mean_squared_error(self.target, self.predict(self.train_df))\n", " std_dev = np.sqrt(rrs)\n", " return np.random.normal(0, std_dev, size=len(self.target_df))\n", " \n", " def predict_missing(self):\n", " return self.predict(self.target_df) + self.add_random_error()\n", "\n", "\n", "class TreeStochasticRegressor(HistGradientBoostingRegressor):\n", " def __init__(self, target: str, train_df: pd.DataFrame, target_df: pd.DataFrame):\n", " super().__init__(learning_rate=0.1, max_depth=5,)\n", " self.target_df = target_df.values\n", " self.train_df = train_df\n", " self.target = self.train_df.pop(target).values\n", " self.train_df = self.train_df.values\n", "\n", " def fit_data(self):\n", " self.fit(self.train_df, self.target)\n", "\n", " def get_r2_score(self):\n", " return self.score(self.train_df, self.target)\n", " \n", " def add_random_error(self):\n", " # Residual sum of squares (Residual Variance):\n", " rrs = mean_squared_error(self.target, self.predict(self.train_df))\n", " std_dev = np.sqrt(rrs)\n", " return np.random.normal(0, std_dev, size=len(self.target_df))\n", " \n", " def predict_missing(self):\n", " return self.predict(self.target_df) + self.add_random_error()" ] }, { "cell_type": "code", "execution_count": 15, "id": "40734056-2e67-40d8-a610-e743b6bd5b0a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Column \"Perceptions_Of_Corruption\"`s values are almost completely missing for country: \"China\"\n", "Column \"Healthy_Life_Expectancy\"`s values are almost completely missing for country: \"Hong Kong S.A.R. of China\"\n", "Column \"Healthy_Life_Expectancy\"`s values are almost completely missing for country: \"Kosovo\"\n", "Column \"Perceptions_Of_Corruption\"`s values are almost completely missing for country: \"Turkmenistan\"\n", "\n", "\n", "######################################################################################################################################################\n", "\n", "Does our data frame contain any missing values? False\n" ] } ], "source": [ "# First, we assume that the data statistics are most accurate at the \"Country_Name\" granularity level\n", "# Hence, we'll utilize the low-level, in-distribution \"Country_Name\" data statistics for imputation when applicable; otherwise, we'll rely on the high-level, out-of-distribution data statistics based on the \"Regional_Indicator\" and \"Year\" granularity levels\n", "\n", "hopeless_TH = 0.9 # Threshold at which we can't sensibly fill in the missing values in a meaningful way without introducing any bias\n", "guess_TH = 0.7 # Threshold at which the missing values are hard (inaccurate) to predict (rely on high-level statistics)\n", "# In between are the thresholds at which the missing values can be predicted using farily complex predictive models \n", "estimate_TH = 0.25 # Threshold at which the missing values are simple to estimate/approximate (linearlly: utilize low-level statistics)\n", "\n", "grouped_df = happiness_df.groupby([\"Country_Name\", \"Regional_Indicator\"], observed=True)\n", "\n", "order = ('hopeless', 'estimate', 'guess', 'predict') # Order at which the imputing mechanism is executed\n", "\n", "for step in order:\n", " for (country_name, country_region), country_group in grouped_df:\n", " country_group = country_group.select_dtypes('number')\n", " missing = country_group.isna().sum()\n", " if missing.any():\n", " missing_ratio = missing[missing > 0]/len(country_group)\n", " region_df = happiness_df[happiness_df[\"Regional_Indicator\"] == country_region].select_dtypes('number')\n", "\n", " if step == 'hopeless':\n", " hopeless_cols = missing_ratio[missing_ratio >= hopeless_TH].index\n", " for col in hopeless_cols:\n", " print(f'Column \"{col}\"`s values are almost completely missing for country: \"{country_name}\"')\n", "\n", " elif step == 'estimate':\n", " estimate_cols = missing_ratio[missing_ratio <= estimate_TH].index\n", " for col in estimate_cols:\n", " happiness_df.loc[happiness_df[\"Country_Name\"] == country_name, col] = np.round(country_group\n", " .set_index(\"Year\")[col]\n", " .interpolate(method=\"index\")\n", " .bfill()\n", " .values, 3)\n", " \n", " elif step == 'guess':\n", " guess_cols = missing_ratio[missing_ratio >= guess_TH].index\n", " for col in guess_cols:\n", " missing = country_group[col].isna() # Boolean mask where rows contain missing values on \"col\"\n", " missing_years = country_group.loc[missing, \"Year\"]\n", " region_by_year = region_df.loc[region_df[\"Year\"].isin(missing_years), [\"Year\", col]].groupby(\"Year\")\n", " happiness_df.loc[(happiness_df[\"Country_Name\"] == country_name) & missing, col] = np.round(region_by_year\n", " .mean()\n", " .values, 3) # Or median\n", " \n", " elif step == 'predict':\n", " # Train a predictive hypothesis function (model) to predict missing values:\n", " predict_cols = missing_ratio[(estimate_TH < missing_ratio) & (missing_ratio < guess_TH)].index\n", " for col in predict_cols:\n", " missing = country_group[col].isna() # Boolean mask where rows contain missing values on \"col\"\n", " target_df = country_group[missing].drop(col, axis=1)\n", " train_df = country_group[~missing]\n", "\n", " if len(smart_dropna(train_df)) < 5: # Not enough statistics to use in-distribution data\n", " missing_years = country_group.loc[missing, \"Year\"]\n", " region_by_year_df = region_df[region_df[\"Year\"].isin(missing_years) & region_df[col].notna()]\n", " params = dict(target=col, train_df=region_by_year_df, target_df=target_df)\n", " else:\n", " params = dict(target=col, train_df=train_df, target_df=target_df)\n", "\n", " if target_df.isna().any().any():\n", " model_name = \"TSR\"\n", " model = TreeStochasticRegressor(**params)\n", " else:\n", " model_name = \"LSR\"\n", " model = LinearStochasticRegressor(**params)\n", "\n", " model.fit_data()\n", " if model_name == \"LSR\":\n", " # If no robust linear correlation or if overfitting:\n", " if model.get_r2_score() < 0.8 or model.get_r2_score() == 1:\n", " model = TreeStochasticRegressor(**params)\n", " model.fit_data()\n", " happiness_df.loc[(happiness_df[\"Country_Name\"] == country_name) & missing, col] = np.round(model.predict_missing(), 3)\n", "\n", "# Make sure we no longer have any missing values in our data frame:\n", "print(separator)\n", "print(f'Does our data frame contain any missing values? {happiness_df.isna().any().any()}')" ] }, { "attachments": {}, "cell_type": "markdown", "id": "089ee331-fc38-41dd-a6c2-8c3beb3aa25d", "metadata": {}, "source": [ "\n", "## Cluster Our Data Frame Based on The Happiness_Index to Discover Underlying (latent) Happiness Levels (clusters)" ] }, { "cell_type": "code", "execution_count": 16, "id": "354f7331-28ec-4515-aa19-afde82dd3b8a", "metadata": {}, "outputs": [], "source": [ "# Cluster our samples based on the \"Happiness_Index\" column:\n", "kmeans = KMeans(n_clusters=5, init='k-means++', n_init=15, max_iter=500)\n", "cluster_name = ['Sad', 'Somewhat Sad', 'Neurtal', 'Somewhat Happy', 'Happy']\n", "\n", "clusters = kmeans.fit_predict(happiness_df[[\"Happiness_Index\"]])\n", "cluster_order = np.argsort(kmeans.cluster_centers_[:, 0])\n", "cluster_mapping = {c_order: c_name for (c_order, c_name) in zip(cluster_order, cluster_name)}\n", "happiness_df['Happiness_Level'] = pd.Series([cluster_mapping[c] for c in clusters]).astype('category')" ] }, { "attachments": {}, "cell_type": "markdown", "id": "750e2d10-7ea8-4e8f-b112-6079d66b7f7f", "metadata": {}, "source": [ "\n", "## Feature (Column) Understanding" ] }, { "cell_type": "markdown", "id": "c9996f5d-e988-4b8d-abfa-1d8239159f63", "metadata": {}, "source": [ "### Generate custom interactive visualizations" ] }, { "cell_type": "code", "execution_count": 48, "id": "8189331b-708c-4379-8991-cc4c6e2d5a91", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pyg.walk(happiness_df, hideDataSourceConfig=True)" ] }, { "attachments": {}, "cell_type": "markdown", "id": "64f4d461-3ed7-445f-9041-98e46da5aabf", "metadata": {}, "source": [ "### Perform univariate feature analysis:" ] }, { "cell_type": "code", "execution_count": 17, "id": "7caf1524-adde-4af7-8072-f5de8c7b81a0", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"Select\",\n \"value\": \"Perceptions_Of_Corruption\",\n \"choices\": [\n \"Country_Name\",\n \"Regional_Indicator\",\n \"Year\",\n \"Happiness_Index\",\n \"Logged_Gdp_Per_Capita\",\n \"Social_Support\",\n \"Healthy_Life_Expectancy\",\n \"Freedom_To_Make_Life_Choices\",\n \"Generosity\",\n \"Perceptions_Of_Corruption\",\n \"Happiness_Level\"\n ],\n \"label\": \"Select a column to investigte\",\n \"model_id\": \"0a1d3aa881cb4becb9ecaff3cf0e0274\",\n \"code_uid\": \"Select.0.40.16.1-randaa8f5bdb\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "0a1d3aa881cb4becb9ecaff3cf0e0274", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "column_select = mr.Select(value=\"Perceptions_Of_Corruption\", choices=happiness_df.columns, label=\"Select a column to investigte\")" ] }, { "cell_type": "code", "execution_count": 18, "id": "819edc60-b6fc-4020-bca3-2062bc683895", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kde_plot = sns.kdeplot(x=column_select.value, data=happiness_df, ax=None)\n", "plt.close()\n", "y_values = kde_plot.lines[0].get_ydata()\n", "x_values = kde_plot.lines[0].get_xdata()\n", "\n", "max_indx = np.argmax(y_values)\n", "max_prob = x_values[max_indx]\n", "mean = happiness_df[column_select.value].mean()\n", "diff = max_prob - mean\n", "if abs(diff) < happiness_df[column_select.value].std()/2.5:\n", " dist = \"Nomarl\"\n", "elif diff > 0:\n", " dist = \"Left-Skewed\"\n", "else:\n", " dist = \"Right-Skewed\"\n", "\n", "(ggplot(happiness_df)\n", " + geom_density(aes(x=column_select.value, y=\"..density..\", fill=column_select.value), stat=\"density\", quantile_lines=True, color=\"gray\")\n", " +ggtitle(f'Historic distribution ({dist}) of {column_select.value} across all countries')\n", " +geom_vline(\n", " xintercept=mean,\n", " color=\"red\", linetype=\"dashed\", size=1)\n", " + geom_text(\n", " x=mean,\n", " y=np.max(y_values)/50,\n", " label='(mean)',\n", " color=\"#ff7f0e\",\n", " size=9,\n", " )\n", " +theme(panel_grid_major_x='blank')\n", " +ggsize(900, 500)\n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "e462b989-1889-4abc-adc9-b838b7529b73", "metadata": {}, "source": [ "### Breakdown how the countries from different happiness levels make up that column's distribution: " ] }, { "cell_type": "code", "execution_count": 19, "id": "c7e23457-ecc6-4473-b960-249c5b2cdc0c", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(ggplot(happiness_df)\n", "+geom_density(aes(x=column_select.value, y=\"..density..\", fill=\"Happiness_Level\"), color=\"black\", position=\"dodge\", stat=\"density\")\n", " +labs(y=\"Density\", title=f\"Distribution of {column_select.value} grouped by Happines Levels\")\n", " +ggsize(900, 500)\n", " +theme(panel_grid_major_x='blank')\n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "358fc571-5260-4159-aac3-23db63a5e4cb", "metadata": {}, "source": [ "### Further deepdive into that column's statistics:" ] }, { "cell_type": "code", "execution_count": 20, "id": "13ada20c-4acf-4c25-ba93-0623394538eb", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(ggplot(happiness_df, aes(x='Happiness_Level', y=column_select.value))\n", " + geom_violin(aes(color='Happiness_Level', fill='Happiness_Level'), size=2, alpha=.5, scale='width',\n", " tooltips=layer_tooltips().line(f\"{column_select.value}:|^y\"))\n", " + geom_boxplot(aes(fill='Happiness_Level'), width=.2)\n", " + ggsize(900, 500))" ] }, { "cell_type": "code", "execution_count": 21, "id": "b8e10d10-d990-465f-9680-5e6e7043c672", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(ggplot(happiness_df)\n", "+ geom_bar(aes(x=\"Happiness_Level\", y=\"..count..\", fill=\"Regional_Indicator\"), color=\"black\",\n", " tooltips=layer_tooltips().line(\"^y\"))\n", "+ ggtitle(\"Chart\") \n", " + ggsize(900, 500)\n", " +theme(panel_grid_major_x='blank')\n", " +labs(title=\"Number of countries in each happiness level grouped by region\", y=\"Count\"))" ] }, { "cell_type": "code", "execution_count": 22, "id": "02ff4500-00b7-4e9c-8e7d-6431f0a0913a", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"Select\",\n \"value\": \"Happiness_Index\",\n \"choices\": [\n \"Country_Name\",\n \"Regional_Indicator\",\n \"Year\",\n \"Happiness_Index\",\n \"Logged_Gdp_Per_Capita\",\n \"Social_Support\",\n \"Healthy_Life_Expectancy\",\n \"Freedom_To_Make_Life_Choices\",\n \"Generosity\",\n \"Perceptions_Of_Corruption\",\n \"Happiness_Level\"\n ],\n \"label\": \"Select a column to investigte\",\n \"model_id\": \"d3bfb5818b7b4e4ab9d39b0c095f99ca\",\n \"code_uid\": \"Select.0.40.16.1-randc8dad19f\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "d3bfb5818b7b4e4ab9d39b0c095f99ca", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "column_select = mr.Select(value=\"Happiness_Index\", choices=happiness_df.columns, label=\"Select a column to investigte\")" ] }, { "cell_type": "code", "execution_count": 23, "id": "6dd946dd-b589-41f8-9651-ba791e707d9c", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "means = happiness_df.groupby(\"Regional_Indicator\", observed=True)[column_select.value].mean().reset_index()\n", "(ggplot()\n", "+geom_point(aes(x=\"Regional_Indicator\", y=column_select.value, color=\"Happiness_Level\"), data=happiness_df, shape=1, size=2.5)\n", "+geom_point(aes(x=\"Regional_Indicator\", y=column_select.value, size=column_select.value,), fill=\"orange\", color=\"black\", data=means, shape=23)\n", " +ggsize(900, 500)\n", " + scale_size(range=[3, 7])\n", " +theme(axis_text_x=element_text(angle=85))\n", " + ggtitle(f\"Historic distribution of the {column_select.value} in each region (yellow diamond: mean)\")\n", ")" ] }, { "cell_type": "code", "execution_count": 24, "id": "a2ca2ff6-a677-4c18-a5d9-3cc138b6d1a7", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "means = happiness_df.groupby(\"Year\", observed=True)[column_select.value].mean().reset_index()\n", "years = np.sort(happiness_df[\"Year\"].unique())\n", "(ggplot()\n", "+geom_point(aes(x=\"Year\", y=column_select.value, fill=column_select.value), shape=21, color=\"black\", data=happiness_df)\n", "+geom_point(aes(x=\"Year\", y=column_select.value, size=column_select.value,), fill=\"blue\", color=\"black\", data=means, shape=23)\n", " +ggsize(900, 500)\n", " + scale_fill_gradient2(midpoint=means[column_select.value].mean(), low='red', mid='yellow', high='green', guide=guide_colorbar(nbin=3, barwidth=10))\n", " + scale_x_continuous(breaks=years)\n", " + scale_size(range=[3, 10], guide='none')\n", " +theme(axis_text_x=element_text(angle=30))\n", "+ggtitle(f\"Distribution of the overall {column_select.value} across the years (blue diamond: mean)\") )" ] }, { "cell_type": "code", "execution_count": 25, "id": "dc2acb71-23cf-4070-9304-edac48933dbc", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"Select\",\n \"value\": 2021,\n \"choices\": [\n 2005,\n 2006,\n 2007,\n 2008,\n 2009,\n 2010,\n 2011,\n 2012,\n 2013,\n 2014,\n 2015,\n 2016,\n 2017,\n 2018,\n 2019,\n 2020,\n 2021\n ],\n \"label\": \"Select year of interest\",\n \"model_id\": \"c2c7877e59554237a1e6b1827b328f5f\",\n \"code_uid\": \"Select.0.40.16.2-randddcdc7c1\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "c2c7877e59554237a1e6b1827b328f5f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "years = [int(y) for y in years]\n", "year_select = mr.Select(value=2021, choices=years, label=\"Select year of interest\")" ] }, { "cell_type": "code", "execution_count": 26, "id": "e3c2819a-09f7-4c30-9014-03503973f0dd", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "happiness_sorted = happiness_df.loc[happiness_df[\"Year\"] == year_select.value, [\"Happiness_Index\", \"Country_Name\", \"Regional_Indicator\"]].sort_values(\"Happiness_Index\", ascending=False)\n", "happiest = happiness_sorted.head(5).copy()\n", "happiest[\"Feeling\"] = \"Happiest\"\n", "saddest = happiness_sorted.tail(5).copy()\n", "saddest[\"Feeling\"] = \"Saddest\"\n", "\n", "combined_df = pd.concat([happiest, saddest], ignore_index=True)\n", "\n", "(ggplot(combined_df)\n", " + geom_bar(aes(x=\"Country_Name\", y=\"Happiness_Index\", fill='Regional_Indicator', color=\"Feeling\"), size=1.5, stat='identity', width=0.7, tooltips=layer_tooltips().line(\"Feeling:|@Feeling\"))\n", " +ggsize(900, 500)\n", " +ggtitle(f\"Top 5 happiest & saddest countries in {year_select.value} grouped by region\")\n", " +theme(panel_grid_major_x='blank')\n", " +scale_color_manual([\"green\", \"red\"])\n", ")" ] }, { "cell_type": "code", "execution_count": 27, "id": "55ab9d5c-f67a-4bbf-9266-f926611994bd", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"Select\",\n \"value\": \"Happiness_Index\",\n \"choices\": [\n \"Country_Name\",\n \"Regional_Indicator\",\n \"Year\",\n \"Happiness_Index\",\n \"Logged_Gdp_Per_Capita\",\n \"Social_Support\",\n \"Healthy_Life_Expectancy\",\n \"Freedom_To_Make_Life_Choices\",\n \"Generosity\",\n \"Perceptions_Of_Corruption\",\n \"Happiness_Level\"\n ],\n \"label\": \"Select column of interest\",\n \"model_id\": \"07c52293be694f6cb2ecfcfc2da45878\",\n \"code_uid\": \"Select.0.40.16.1-rand16d72421\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "07c52293be694f6cb2ecfcfc2da45878", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "column_select = mr.Select(value=\"Happiness_Index\", choices=happiness_df.columns, label=\"Select column of interest\")" ] }, { "cell_type": "code", "execution_count": 28, "id": "759ef827-e0b5-45cd-bddb-780fc90b8d26", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "col_change = (happiness_df[[\"Year\", \"Country_Name\", column_select.value, \"Regional_Indicator\"]]\n", " .groupby([\"Country_Name\", \"Regional_Indicator\"], observed=True)[column_select.value]\n", " .pct_change() * 100)\n", "col_change = col_change.rename(\"Percent_Change\")\n", "col_change = pd.concat([happiness_df, col_change], axis=1)\n", "\n", "col_change_by_year = col_change.sort_values(\"Year\").groupby(\"Year\", observed=True)\n", "max_col_change_by_year = col_change_by_year[\"Percent_Change\"].idxmax()\n", "\n", "max_country_by_year = col_change.loc[max_col_change_by_year.dropna()]\n", "max_country_by_year[\"Year_Country\"] = max_country_by_year[\"Year\"].astype(str) + \":\\n\" + max_country_by_year[\"Country_Name\"].astype(str)\n", "\n", "(\n", "ggplot(max_country_by_year)\n", "+ geom_bar(aes(x=\"Year_Country\", y=\"Percent_Change\", fill=\"Percent_Change\"), stat='identity', color=\"black\",\n", " tooltips=layer_tooltips().line(\"@Country_Name\").line(f\"Percent Change in {column_select.value}|= ^y%\"))\n", "+ scale_fill_gradient2(low='red', mid='white', high='darkgreen', midpoint=0)\n", "+ ylim(max_country_by_year[\"Percent_Change\"].min()-0.5, max_country_by_year[\"Percent_Change\"].max()+0.5)\n", " +geom_hline(\n", " yintercept=0,\n", " color=\"pink\", size=1)\n", "+ ggsize(900, 500)\n", "+ ggtitle(f\"YoY maximum percent change in {column_select.value} observed in each year\")\n", " +theme(axis_text_x=element_text(angle=70), panel_grid_major_x='blank')\n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "24f929a7-55ee-4dc7-bf48-9c34afe5955f", "metadata": {}, "source": [ "### Perform Bivariate feature analysis:" ] }, { "cell_type": "code", "execution_count": 29, "id": "694ebe1e-1aad-4ce4-9bc6-73967a72e19a", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(corr_plot(happiness_df.select_dtypes('number'))\n", " .points(type=\"full\")\n", " .labels(type=\"full\").build() + ggtitle(\"Tiles, points and labels\")\n", " +ggsize(900, 600)\n", " +ggtitle(\"Overall Correlation Heatmap\"))\n", "\n", "# Very surprising to see that there is no overall correlation at all between \"Generosity\" and \"Gdp_Per_Caipta\"!" ] }, { "cell_type": "code", "execution_count": 30, "id": "a99c153b-ee0b-4452-9ecc-7d5410e416fa", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"Select\",\n \"value\": \"United Arab Emirates\",\n \"choices\": [\n \"Afghanistan\",\n \"Albania\",\n \"Algeria\",\n \"Argentina\",\n \"Armenia\",\n \"Australia\",\n \"Austria\",\n \"Azerbaijan\",\n \"Bahrain\",\n \"Bangladesh\",\n \"Belarus\",\n \"Belgium\",\n \"Benin\",\n \"Bolivia\",\n \"Bosnia and Herzegovina\",\n \"Botswana\",\n \"Brazil\",\n \"Bulgaria\",\n \"Burkina Faso\",\n \"Burundi\",\n \"Cambodia\",\n \"Cameroon\",\n \"Canada\",\n \"Chad\",\n \"Chile\",\n \"China\",\n \"Colombia\",\n \"Comoros\",\n \"Congo (Brazzaville)\",\n \"Costa Rica\",\n \"Croatia\",\n \"Cyprus\",\n \"Czech Republic\",\n \"Denmark\",\n \"Dominican Republic\",\n \"Ecuador\",\n \"Egypt\",\n \"El Salvador\",\n \"Estonia\",\n \"Ethiopia\",\n \"Finland\",\n \"France\",\n \"Gabon\",\n \"Gambia\",\n \"Georgia\",\n \"Germany\",\n \"Ghana\",\n \"Greece\",\n \"Guatemala\",\n \"Guinea\",\n \"Haiti\",\n \"Honduras\",\n \"Hong Kong S.A.R. of China\",\n \"Hungary\",\n \"Iceland\",\n \"India\",\n \"Indonesia\",\n \"Iran\",\n \"Iraq\",\n \"Ireland\",\n \"Israel\",\n \"Italy\",\n \"Ivory Coast\",\n \"Jamaica\",\n \"Japan\",\n \"Jordan\",\n \"Kazakhstan\",\n \"Kenya\",\n \"Kosovo\",\n \"Kuwait\",\n \"Kyrgyzstan\",\n \"Laos\",\n \"Latvia\",\n \"Lebanon\",\n \"Lesotho\",\n \"Liberia\",\n \"Libya\",\n \"Lithuania\",\n \"Luxembourg\",\n \"Madagascar\",\n \"Malawi\",\n \"Malaysia\",\n \"Maldives\",\n \"Mali\",\n \"Malta\",\n \"Mauritania\",\n \"Mauritius\",\n \"Mexico\",\n \"Moldova\",\n \"Mongolia\",\n \"Montenegro\",\n \"Morocco\",\n \"Mozambique\",\n \"Myanmar\",\n \"Namibia\",\n \"Nepal\",\n \"Netherlands\",\n \"New Zealand\",\n \"Nicaragua\",\n \"Niger\",\n \"Nigeria\",\n \"North Cyprus\",\n \"North Macedonia\",\n \"Norway\",\n \"Pakistan\",\n \"Palestinian Territories\",\n \"Panama\",\n \"Paraguay\",\n \"Peru\",\n \"Philippines\",\n \"Poland\",\n \"Portugal\",\n \"Romania\",\n \"Russia\",\n \"Rwanda\",\n \"Saudi Arabia\",\n \"Senegal\",\n \"Serbia\",\n \"Sierra Leone\",\n \"Singapore\",\n \"Slovakia\",\n \"Slovenia\",\n \"South Africa\",\n \"South Korea\",\n \"Spain\",\n \"Sri Lanka\",\n \"Swaziland\",\n \"Sweden\",\n \"Switzerland\",\n \"Taiwan Province of China\",\n \"Tajikistan\",\n \"Tanzania\",\n \"Thailand\",\n \"Togo\",\n \"Tunisia\",\n \"Turkey\",\n \"Turkmenistan\",\n \"Uganda\",\n \"Ukraine\",\n \"United Arab Emirates\",\n \"United Kingdom\",\n \"United States\",\n \"Uruguay\",\n \"Uzbekistan\",\n \"Venezuela\",\n \"Vietnam\",\n \"Yemen\",\n \"Zambia\",\n \"Zimbabwe\"\n ],\n \"label\": \"Select country of interest\",\n \"model_id\": \"d4e45183e0e9442bbfcce3e5bbc883f9\",\n \"code_uid\": \"Select.0.40.16.1-rand7ff3255b\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "d4e45183e0e9442bbfcce3e5bbc883f9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "country_select = mr.Select(value=\"United Arab Emirates\", choices=happiness_df[\"Country_Name\"].unique(), label=\"Select country of interest\")" ] }, { "cell_type": "code", "execution_count": 31, "id": "80972f1b-57d4-4c91-9f25-81c93cf34fd9", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(corr_plot(happiness_df[happiness_df[\"Country_Name\"] == country_select.value].select_dtypes('number'))\n", " .points(type=\"full\")\n", " .labels(type=\"full\").palette_BrBG().build() + ggtitle(\"Tiles, points and labels\")\n", " +ggsize(900, 600)\n", " +ggtitle(f\"{country_select.value}'s Correlation Heatmap\"))\n" ] }, { "cell_type": "code", "execution_count": 32, "id": "8d54fc1f-69fa-4464-a6cc-f2c947498bf8", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"MultiSelect\",\n \"value\": [\n \"Logged_Gdp_Per_Capita\",\n \"Happiness_Index\"\n ],\n \"choices\": [\n \"Country_Name\",\n \"Regional_Indicator\",\n \"Year\",\n \"Happiness_Index\",\n \"Logged_Gdp_Per_Capita\",\n \"Social_Support\",\n \"Healthy_Life_Expectancy\",\n \"Freedom_To_Make_Life_Choices\",\n \"Generosity\",\n \"Perceptions_Of_Corruption\",\n \"Happiness_Level\"\n ],\n \"label\": \"Select two features to investigate\",\n \"model_id\": \"0b6c47536f354dad9badceffe5f91097\",\n \"code_uid\": \"MultiSelect.0.40.16.1-rand4f18112e\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "0b6c47536f354dad9badceffe5f91097", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.MultiSelect" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "columns_select = mr.MultiSelect(label=\"Select two features to investigate\", \n", " value=[\"Logged_Gdp_Per_Capita\", \"Happiness_Index\"], \n", " choices=happiness_df.columns)" ] }, { "cell_type": "code", "execution_count": 33, "id": "1ff7a3a4-73aa-49af-93ce-a0de90330c3c", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns_pair = columns_select.value\n", "\n", "assert len(columns_pair) == 2, f\"Please select exactly two columns, you've selected {len(columns_pair)}!\"\n", "\n", "(joint_plot(happiness_df, x=columns_pair[0], y=columns_pair[1], color_by=\"Happiness_Level\")\n", " + facet_wrap(\"Regional_Indicator\", nrow=5, scales=\"free\")\n", " + geom_smooth(color=\"gray\")\n", " + ggsize(1000, 1200)\n", " + ggtitle(f\"Overall Trend in {columns_pair[1]} Against {columns_pair[0]} Across Every Region Grouped by Happiness Levels\")\n", " + labs(caption=f\"This figure illustrates how the {columns_pair[1]} of the different happiness groups within each region reacts to changes in {columns_pair[0]}\")\n", " +theme(legend_position=\"top\", axis_title=element_text(margin=margin(t=20, b=30,l=30)))\n", ")" ] }, { "cell_type": "code", "execution_count": 34, "id": "3f794869-fbac-4aa5-b1f0-37aff4d3d5e4", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"MultiSelect\",\n \"value\": [\n \"Perceptions_Of_Corruption\",\n \"Happiness_Index\"\n ],\n \"choices\": [\n \"Country_Name\",\n \"Regional_Indicator\",\n \"Year\",\n \"Happiness_Index\",\n \"Logged_Gdp_Per_Capita\",\n \"Social_Support\",\n \"Healthy_Life_Expectancy\",\n \"Freedom_To_Make_Life_Choices\",\n \"Generosity\",\n \"Perceptions_Of_Corruption\",\n \"Happiness_Level\"\n ],\n \"label\": \"Select two features to investigate\",\n \"model_id\": \"e66592047e5e4afe9ca8cf16b1c51478\",\n \"code_uid\": \"MultiSelect.0.40.16.1-rand978b1e43\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "e66592047e5e4afe9ca8cf16b1c51478", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.MultiSelect" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "columns_select = mr.MultiSelect(label=\"Select two features to investigate\", \n", " value=[\"Perceptions_Of_Corruption\", \"Happiness_Index\"], \n", " choices=happiness_df.columns)" ] }, { "cell_type": "code", "execution_count": 35, "id": "6e47ca06-59fa-4a1e-9de4-658237919f9c", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns_pair = columns_select.value\n", "\n", "assert len(columns_pair) == 2, f\"Please select exactly two columns, you've selected {len(columns_pair)}!\"\n", "\n", "(ggplot(happiness_df)\n", "+ geom_smooth(aes(x=columns_pair[0], y=columns_pair[1], color=\"Happiness_Level\"), size=1.3)\n", "+ ggsize(900, 500)\n", " + ggtitle(f\"How The {columns_pair[1]} varies against {columns_pair[0]} grouped by Happiness Levels worldwide\")\n", ")" ] }, { "cell_type": "code", "execution_count": 36, "id": "dc02839b-0efa-466e-8a62-32c2537426ab", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"Select\",\n \"value\": \"Happiness_Index\",\n \"choices\": [\n \"Country_Name\",\n \"Regional_Indicator\",\n \"Year\",\n \"Happiness_Index\",\n \"Logged_Gdp_Per_Capita\",\n \"Social_Support\",\n \"Healthy_Life_Expectancy\",\n \"Freedom_To_Make_Life_Choices\",\n \"Generosity\",\n \"Perceptions_Of_Corruption\",\n \"Happiness_Level\"\n ],\n \"label\": \"Select column to view trend overtime\",\n \"model_id\": \"3509952df688414ea9fa42f20e01132e\",\n \"code_uid\": \"Select.0.40.16.1-randb1597ea7\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "3509952df688414ea9fa42f20e01132e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "column_select = mr.Select(value=\"Happiness_Index\", choices=happiness_df.columns, label=\"Select column to view trend overtime\")" ] }, { "cell_type": "code", "execution_count": 37, "id": "0df557cc-0c11-4586-88d6-25d489516368", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "region_happiness = happiness_df.groupby([\"Regional_Indicator\", \"Year\"]).agg({\n", " column_select.value: \"mean\"\n", "}).reset_index()\n", "\n", "(ggplot(region_happiness)\n", " +geom_line(aes(x=\"Year\", y=column_select.value, color=\"Regional_Indicator\"), size=3,\n", " tooltips=layer_tooltips().line(f\"{column_select.value}: ^y\"))\n", " + facet_wrap(\"Regional_Indicator\", nrow=5, scales=\"free\")\n", " + ggtitle(f\"Trend of average {column_select.value} over the years accross every region\")\n", " +theme(legend_position=\"bottom\")\n", " +scale_color_discrete(guide=guide_legend(ncol=2))\n", "+ scale_x_continuous(breaks=list(range(2005, 2022, 5)) + [2021])\n", " + ggsize(1000, 1000)\n", ")" ] }, { "cell_type": "code", "execution_count": 38, "id": "3344b01d-13a2-4754-bbb0-e660d5caf2f2", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"MultiSelect\",\n \"value\": [\n \"United Arab Emirates\",\n \"Egypt\",\n \"Finland\",\n \"United States\",\n \"Russia\"\n ],\n \"choices\": [\n \"Afghanistan\",\n \"Albania\",\n \"Algeria\",\n \"Argentina\",\n \"Armenia\",\n \"Australia\",\n \"Austria\",\n \"Azerbaijan\",\n \"Bahrain\",\n \"Bangladesh\",\n \"Belarus\",\n \"Belgium\",\n \"Benin\",\n \"Bolivia\",\n \"Bosnia and Herzegovina\",\n \"Botswana\",\n \"Brazil\",\n \"Bulgaria\",\n \"Burkina Faso\",\n \"Burundi\",\n \"Cambodia\",\n \"Cameroon\",\n \"Canada\",\n \"Chad\",\n \"Chile\",\n \"China\",\n \"Colombia\",\n \"Comoros\",\n \"Congo (Brazzaville)\",\n \"Costa Rica\",\n \"Croatia\",\n \"Cyprus\",\n \"Czech Republic\",\n \"Denmark\",\n \"Dominican Republic\",\n \"Ecuador\",\n \"Egypt\",\n \"El Salvador\",\n \"Estonia\",\n \"Ethiopia\",\n \"Finland\",\n \"France\",\n \"Gabon\",\n \"Gambia\",\n \"Georgia\",\n \"Germany\",\n \"Ghana\",\n \"Greece\",\n \"Guatemala\",\n \"Guinea\",\n \"Haiti\",\n \"Honduras\",\n \"Hong Kong S.A.R. of China\",\n \"Hungary\",\n \"Iceland\",\n \"India\",\n \"Indonesia\",\n \"Iran\",\n \"Iraq\",\n \"Ireland\",\n \"Israel\",\n \"Italy\",\n \"Ivory Coast\",\n \"Jamaica\",\n \"Japan\",\n \"Jordan\",\n \"Kazakhstan\",\n \"Kenya\",\n \"Kosovo\",\n \"Kuwait\",\n \"Kyrgyzstan\",\n \"Laos\",\n \"Latvia\",\n \"Lebanon\",\n \"Lesotho\",\n \"Liberia\",\n \"Libya\",\n \"Lithuania\",\n \"Luxembourg\",\n \"Madagascar\",\n \"Malawi\",\n \"Malaysia\",\n \"Maldives\",\n \"Mali\",\n \"Malta\",\n \"Mauritania\",\n \"Mauritius\",\n \"Mexico\",\n \"Moldova\",\n \"Mongolia\",\n \"Montenegro\",\n \"Morocco\",\n \"Mozambique\",\n \"Myanmar\",\n \"Namibia\",\n \"Nepal\",\n \"Netherlands\",\n \"New Zealand\",\n \"Nicaragua\",\n \"Niger\",\n \"Nigeria\",\n \"North Cyprus\",\n \"North Macedonia\",\n \"Norway\",\n \"Pakistan\",\n \"Palestinian Territories\",\n \"Panama\",\n \"Paraguay\",\n \"Peru\",\n \"Philippines\",\n \"Poland\",\n \"Portugal\",\n \"Romania\",\n \"Russia\",\n \"Rwanda\",\n \"Saudi Arabia\",\n \"Senegal\",\n \"Serbia\",\n \"Sierra Leone\",\n \"Singapore\",\n \"Slovakia\",\n \"Slovenia\",\n \"South Africa\",\n \"South Korea\",\n \"Spain\",\n \"Sri Lanka\",\n \"Swaziland\",\n \"Sweden\",\n \"Switzerland\",\n \"Taiwan Province of China\",\n \"Tajikistan\",\n \"Tanzania\",\n \"Thailand\",\n \"Togo\",\n \"Tunisia\",\n \"Turkey\",\n \"Turkmenistan\",\n \"Uganda\",\n \"Ukraine\",\n \"United Arab Emirates\",\n \"United Kingdom\",\n \"United States\",\n \"Uruguay\",\n \"Uzbekistan\",\n \"Venezuela\",\n \"Vietnam\",\n \"Yemen\",\n \"Zambia\",\n \"Zimbabwe\"\n ],\n \"label\": \"Select countries of interest\",\n \"model_id\": \"7b821dda8d054ed19d8218795db8029a\",\n \"code_uid\": \"MultiSelect.0.40.16.2-rand99984181\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "7b821dda8d054ed19d8218795db8029a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.MultiSelect" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/mercury+json": "{\n \"widget\": \"Select\",\n \"value\": \"Happiness_Index\",\n \"choices\": [\n \"Country_Name\",\n \"Regional_Indicator\",\n \"Year\",\n \"Happiness_Index\",\n \"Logged_Gdp_Per_Capita\",\n \"Social_Support\",\n \"Healthy_Life_Expectancy\",\n \"Freedom_To_Make_Life_Choices\",\n \"Generosity\",\n \"Perceptions_Of_Corruption\",\n \"Happiness_Level\"\n ],\n \"label\": \"Select column of interest\",\n \"model_id\": \"93500d44aed24c3b8726701995f048ad\",\n \"code_uid\": \"Select.0.40.16.3-rand185c0b99\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "93500d44aed24c3b8726701995f048ad", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "countries_of_interest = [\"United Arab Emirates\", \"Egypt\", \"Finland\", \"United States\", \"Russia\"]\n", "countries_select = mr.MultiSelect(value=countries_of_interest, choices=happiness_df[\"Country_Name\"].unique(), label=\"Select countries of interest\")\n", "column_select = mr.Select(value=\"Happiness_Index\", choices=happiness_df.columns, label=\"Select column of interest\")" ] }, { "cell_type": "code", "execution_count": 39, "id": "3e18d9de-045f-4894-9e20-18d2a5a92289", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries_df = happiness_df[happiness_df[\"Country_Name\"].isin(countries_select.value)]\n", "(\n", "ggplot(countries_df)\n", " + geom_point(aes(x=\"Year\", y=\"Country_Name\", fill=column_select.value, size=column_select.value), shape=25, color=\"black\")\n", " + scale_fill_gradient2(midpoint=happiness_df[column_select.value].mean(), low='#d7191c', mid='yellow', high='#2b83ba', guide=guide_colorbar(nbin=5, barwidth=10))\n", "+ scale_size(range=[3, 10], guide='none')\n", "+ ggsize(900, 500)\n", " + scale_x_continuous(breaks=years)\n", " +theme(axis_text_x=element_text(angle=70))\n", ")" ] }, { "cell_type": "code", "execution_count": 40, "id": "5a50cc04-524a-4941-904c-3a353b666d2c", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/mercury+json": "{\n \"widget\": \"Select\",\n \"value\": \"Happiness_Index\",\n \"choices\": [\n \"Country_Name\",\n \"Regional_Indicator\",\n \"Year\",\n \"Happiness_Index\",\n \"Logged_Gdp_Per_Capita\",\n \"Social_Support\",\n \"Healthy_Life_Expectancy\",\n \"Freedom_To_Make_Life_Choices\",\n \"Generosity\",\n \"Perceptions_Of_Corruption\",\n \"Happiness_Level\"\n ],\n \"label\": \"Select column of interest to view worldwide\",\n \"model_id\": \"a547fa06676443c78beb050e4e5d05f2\",\n \"code_uid\": \"Select.0.40.16.1-rand84ee96d1\",\n \"url_key\": \"\",\n \"disabled\": false,\n \"hidden\": false\n}", "application/vnd.jupyter.widget-view+json": { "model_id": "a547fa06676443c78beb050e4e5d05f2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "mercury.Select" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "column_select = mr.Select(value=\"Happiness_Index\", choices=happiness_df.columns, label=\"Select column of interest to view worldwide\")" ] }, { "cell_type": "code", "execution_count": 41, "id": "6d4140a2-40c3-499c-872a-a86586bcd04e", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "happiness_by_country = happiness_df.groupby([\"Country_Name\", \"Regional_Indicator\"], observed=True).agg({\n", " column_select.value: \"mean\",\n", "}).reset_index()\n", "happiness_by_country = happiness_by_country[happiness_by_country[\"Country_Name\"] != \"Hong Kong S.A.R. of China\"]\n", "happiness_by_country[\"Country_Name\"] = happiness_by_country[\"Country_Name\"].astype('object')\n", "happiness_by_country[\"Country_Name\"] = happiness_by_country[\"Country_Name\"].replace({\"North Cyprus\": \"Cyprus\", \"Palestinian Territories\": \"Palestine\", \"Taiwan Province of China\": \"Taiwan\"})\n", "\n", "countries_gcoder = geocode_countries(happiness_by_country[\"Country_Name\"])\n", "\n", "(ggplot() \n", " + geom_livemap(location=[53, 24],\n", " zoom=4)\n", " + geom_polygon(aes(fill=column_select.value),\n", " data=happiness_by_country,\n", " map=countries_gcoder.get_boundaries(),\n", " map_join=[[\"Country_Name\"], [\"country\"]],\n", " alpha=0.7,\n", " color=\"gray\",\n", " size=0.5,\n", " tooltips=layer_tooltips().line('@Country_Name').line(f'{column_select.value}:| @{column_select.value}'))\n", " + theme(legend_position='top')\n", " + ggsize(900, 600)\n", " + scale_fill_gradient2(midpoint=happiness_by_country[column_select.value].mean(), low='red', mid='yellow', high='green', guide=guide_colorbar(nbin=5))\n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "id": "5d776217-7b29-4699-b98b-7b6fb1ed2e53", "metadata": {}, "source": [ "\n", "## Model Training and Evaluation" ] }, { "attachments": {}, "cell_type": "markdown", "id": "56cb0ac2-47a7-4de9-a9d9-b1af278c02db", "metadata": {}, "source": [ "### Feature Engineering: Prepare data to be digestible appropriately by machine learning models for training:" ] }, { "cell_type": "code", "execution_count": 42, "id": "8a8f11aa-af44-445f-a118-567bcc005a03", "metadata": {}, "outputs": [], "source": [ "def process_df_features(df: pd.DataFrame, ignore: Union[list, tuple]=(), drop: Union[list, tuple]=()):\n", " df = df.copy()\n", " numerical_preprocessor = StandardScaler()\n", " categorical_preprocessor = OneHotEncoder(sparse_output=False, handle_unknown=\"ignore\")\n", " for col in df.columns:\n", " if col in drop:\n", " df = df.drop(col, axis=1)\n", "\n", " elif col in ignore:\n", " continue\n", "\n", " elif df[col].dtype in (int, 'category', bool):\n", " unique_vals = np.sort(df[col].unique())\n", " print(f'Column \"{col}`s\" unique values are: {unique_vals}')\n", "\n", " df[col] = df[col].astype('category')\n", " onehot_enc = categorical_preprocessor.fit_transform(df[[col]])\n", " onehot_df = pd.DataFrame(onehot_enc, columns=[f\"{col}_{val}\" for val in unique_vals])\n", " df = pd.concat([df.drop(col, axis=1), onehot_df], axis=1)\n", "\n", " elif df[col].dtype == float:\n", " df[col] = numerical_preprocessor.fit_transform(df[[col]])\n", " return df" ] }, { "cell_type": "code", "execution_count": 43, "id": "cbfceef0-a549-4636-a5e5-00b8fab5a9b8", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Column \"Regional_Indicator`s\" unique values are: ['Central and Eastern Europe' 'Commonwealth of Independent States' 'East Asia' 'Latin America and Caribbean' 'Middle East and North Africa' 'North America and ANZ' 'South Asia' 'Southeast Asia' 'Sub-Saharan Africa' 'Western Europe']\n", "Column \"Year`s\" unique values are: [2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]\n", "\n", "Start training our models:\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/4 [00:00