{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Malnutrition in the World\n", "\n", "Data comes from [here](https://www.kaggle.com/ruchi798/malnutrition-across-the-globe)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:29.398325Z", "iopub.status.busy": "2024-04-17T07:39:29.398226Z", "iopub.status.idle": "2024-04-17T07:39:29.936558Z", "shell.execute_reply": "2024-04-17T07:39:29.936229Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import geopandas as gpd\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "from lets_plot import *\n", "from lets_plot.bistro.corr import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:29.938434Z", "iopub.status.busy": "2024-04-17T07:39:29.937990Z", "iopub.status.idle": "2024-04-17T07:39:29.940811Z", "shell.execute_reply": "2024-04-17T07:39:29.940502Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:29.953707Z", "iopub.status.busy": "2024-04-17T07:39:29.953533Z", "iopub.status.idle": "2024-04-17T07:39:30.360211Z", "shell.execute_reply": "2024-04-17T07:39:30.359779Z" } }, "outputs": [], "source": [ "cwa_df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/\"\n", " \"master/data/malnutrition/country_wise_average.csv\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:30.361525Z", "iopub.status.busy": "2024-04-17T07:39:30.361335Z", "iopub.status.idle": "2024-04-17T07:39:30.795324Z", "shell.execute_reply": "2024-04-17T07:39:30.794902Z" } }, "outputs": [], "source": [ "me_df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/\"\n", " \"master/data/malnutrition/malnutrition_estimates.csv\")\n", "me_df = me_df.sort_values(by='Year', ascending=False).drop_duplicates(subset='Country')\n", "me_df = me_df[['Country', 'ISO code', 'LDC', 'LIFD', 'LLDC or SID2']]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:30.796726Z", "iopub.status.busy": "2024-04-17T07:39:30.796442Z", "iopub.status.idle": "2024-04-17T07:39:30.799175Z", "shell.execute_reply": "2024-04-17T07:39:30.798915Z" } }, "outputs": [], "source": [ "df = cwa_df.merge(me_df, on='Country')\n", "df.Country = df.Country.apply(lambda country_name: country_name.capitalize())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A Bit of Correlation Analysis\n", "\n", "Let's look at the correlation coefficients of random variables that correspond to dataframe columns." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:30.800273Z", "iopub.status.busy": "2024-04-17T07:39:30.800149Z", "iopub.status.idle": "2024-04-17T07:39:30.807230Z", "shell.execute_reply": "2024-04-17T07:39:30.806980Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "corr_plot(df.corr(numeric_only=True)).tiles().palette_RdBu().build() + ggsize(600, 600)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Choropleth Maps\n", "\n", "For each map, the total of children with this disorder in red countries equals to that in blue countries." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:30.808360Z", "iopub.status.busy": "2024-04-17T07:39:30.808233Z", "iopub.status.idle": "2024-04-17T07:39:39.266675Z", "shell.execute_reply": "2024-04-17T07:39:39.266141Z" } }, "outputs": [], "source": [ "cat_cols = ['Income Classification', 'LDC', 'LIFD', 'LLDC or SID2']\n", "num_cols = ['U5 Population (\\'000s)']\n", "features = cat_cols + num_cols\n", "targets = ['Severe Wasting', 'Wasting', 'Overweight', 'Stunting', 'Underweight']\n", "targets = list(df[targets].isnull().sum().sort_values().keys())\n", "\n", "for target in targets:\n", " df_train = df[~df[target].isnull()]\n", " df_test = df[df[target].isnull()]\n", " X, y = df_train[features], df_train[target]\n", " X_test = df_test[features]\n", "\n", " parameters = dict(n_estimators=[10, 20, 30, 60], max_depth=[1, 2, 3, 4, 5, 6, 7])\n", " model = GridSearchCV(RandomForestRegressor(), parameters)\n", " model.fit(X, y)\n", " y_pred = model.predict(X_test)\n", " \n", " df[target] = df[target].fillna(pd.Series(y_pred, index=X_test.index))\n", "\n", " features = features + [target]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:39.268222Z", "iopub.status.busy": "2024-04-17T07:39:39.268012Z", "iopub.status.idle": "2024-04-17T07:39:39.270600Z", "shell.execute_reply": "2024-04-17T07:39:39.270352Z" } }, "outputs": [], "source": [ "def get_naturalearth_data(data_type=\"admin_0_countries\", columns=[\"NAME\", \"geometry\"]):\n", " import shapefile\n", " from shapely.geometry import shape\n", "\n", " naturalearth_url = \"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/\" + \\\n", " \"data/naturalearth/{0}/data.shp?raw=true\".format(data_type)\n", " sf = shapefile.Reader(naturalearth_url)\n", "\n", " gdf = gpd.GeoDataFrame(\n", " [\n", " dict(zip([field[0] for field in sf.fields[1:]], record))\n", " for record in sf.records()\n", " ],\n", " geometry=[shape(s) for s in sf.shapes()]\n", " )[columns]\n", " gdf.columns = [col.lower() for col in gdf.columns]\n", "\n", " return gdf" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:39.271684Z", "iopub.status.busy": "2024-04-17T07:39:39.271545Z", "iopub.status.idle": "2024-04-17T07:39:39.274453Z", "shell.execute_reply": "2024-04-17T07:39:39.274219Z" } }, "outputs": [], "source": [ "def plot_world_decomposition(df, world_gdf, target):\n", " target_abs = target + ' ABS'\n", " target_bh = target + ' BH'\n", "\n", " df_copy = df.copy()\n", " df_copy[target_abs] = df_copy[target] * df_copy['U5 Population (\\'000s)'] / 100\n", " df_copy = df_copy.sort_values(by=target_abs, ascending=False)\n", " df_copy[target_bh] = df_copy[target_abs].cumsum() < df_copy[target_abs].sum() / 2\n", " df_copy.loc[df_copy[target_bh].ne(True).idxmax(), target_bh] = True\n", " df_copy[target_bh] = df_copy[target_bh].apply(lambda r: 'Bigger half' if r else 'Smaller half')\n", "\n", " sifted_df = df_copy[['ISO code', target_abs, target_bh]]\n", " merged_df = gpd.GeoDataFrame(sifted_df.merge(world_gdf, how='right', left_on='ISO code', right_on='iso_a3'))\n", " merged_df[target_abs] = merged_df[target_abs].fillna(0)\n", " merged_df[target_bh] = merged_df[target_bh].fillna('Not in statistics')\n", "\n", " return ggplot() + \\\n", " geom_polygon(aes(fill=target_bh, size='name', alpha=target_abs), \\\n", " data=merged_df, color='black', \\\n", " tooltips=layer_tooltips().title('@name')\\\n", " .line(\"{0}|@{{{1}}}\".format(target.lower(), target_abs))) + \\\n", " scale_size(name='Country', range=[.3, .3]) + \\\n", " scale_alpha(name=target, range=[.6, 1]) + \\\n", " scale_fill_manual(name=target, values=['#0571b0', '#bababa', '#ca0020']) + \\\n", " ggtitle('%s in the World' % target) + \\\n", " ggsize(600, 450) + \\\n", " theme_void() + theme(legend_position='none')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:39.275473Z", "iopub.status.busy": "2024-04-17T07:39:39.275389Z", "iopub.status.idle": "2024-04-17T07:39:39.939997Z", "shell.execute_reply": "2024-04-17T07:39:39.939600Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameiso_a3continentpop_estgdp_mdgeometry
0FijiFJIOceania889953.05496MULTIPOLYGON (((180.00000 -16.06713, 180.00000...
1TanzaniaTZAAfrica58005463.063177POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...
2W. SaharaESHAfrica603253.0907POLYGON ((-8.66559 27.65643, -8.66512 27.58948...
3CanadaCANNorth America37589262.01736425MULTIPOLYGON (((-122.84000 49.00000, -122.9742...
4United States of AmericaUSANorth America328239523.021433226MULTIPOLYGON (((-122.84000 49.00000, -120.0000...
\n", "
" ], "text/plain": [ " name iso_a3 continent pop_est gdp_md \\\n", "0 Fiji FJI Oceania 889953.0 5496 \n", "1 Tanzania TZA Africa 58005463.0 63177 \n", "2 W. Sahara ESH Africa 603253.0 907 \n", "3 Canada CAN North America 37589262.0 1736425 \n", "4 United States of America USA North America 328239523.0 21433226 \n", "\n", " geometry \n", "0 MULTIPOLYGON (((180.00000 -16.06713, 180.00000... \n", "1 POLYGON ((33.90371 -0.95000, 34.07262 -1.05982... \n", "2 POLYGON ((-8.66559 27.65643, -8.66512 27.58948... \n", "3 MULTIPOLYGON (((-122.84000 49.00000, -122.9742... \n", "4 MULTIPOLYGON (((-122.84000 49.00000, -120.0000... " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "world_gdf = get_naturalearth_data(columns=[\"NAME\", \"ISO_A3\", \"CONTINENT\", \"POP_EST\", \"GDP_MD\", \"geometry\"])\n", "world_gdf.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:39.941155Z", "iopub.status.busy": "2024-04-17T07:39:39.941077Z", "iopub.status.idle": "2024-04-17T07:39:40.002128Z", "shell.execute_reply": "2024-04-17T07:39:40.001750Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "plot_world_decomposition(df, world_gdf, 'Severe Wasting')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:40.004320Z", "iopub.status.busy": "2024-04-17T07:39:40.004086Z", "iopub.status.idle": "2024-04-17T07:39:40.056187Z", "shell.execute_reply": "2024-04-17T07:39:40.055727Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "plot_world_decomposition(df, world_gdf, 'Wasting')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:40.058063Z", "iopub.status.busy": "2024-04-17T07:39:40.057952Z", "iopub.status.idle": "2024-04-17T07:39:40.110938Z", "shell.execute_reply": "2024-04-17T07:39:40.110645Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "plot_world_decomposition(df, world_gdf, 'Overweight')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:40.113424Z", "iopub.status.busy": "2024-04-17T07:39:40.113337Z", "iopub.status.idle": "2024-04-17T07:39:40.167438Z", "shell.execute_reply": "2024-04-17T07:39:40.167135Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "plot_world_decomposition(df, world_gdf, 'Stunting')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2024-04-17T07:39:40.169646Z", "iopub.status.busy": "2024-04-17T07:39:40.169448Z", "iopub.status.idle": "2024-04-17T07:39:40.221899Z", "shell.execute_reply": "2024-04-17T07:39:40.221584Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "plot_world_decomposition(df, world_gdf, 'Underweight')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 4 }