{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<details><summary><b>LICENSE</b></summary>\n",
    "\n",
    "Copyright 2015 Donne Martin\n",
    "\n",
    "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "you may not use this file except in compliance with the License.\n",
    "You may obtain a copy of the License at\n",
    "\n",
    "   http://www.apache.org/licenses/LICENSE-2.0\n",
    "\n",
    "Unless required by applicable law or agreed to in writing, software\n",
    "distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "See the License for the specific language governing permissions and\n",
    "limitations under the License.\n",
    "\n",
    "</details>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Matplotlib applied\n",
    "\n",
    "* Applying Matplotlib Visualizations to Kaggle: Titanic\n",
    "* Bar Plots, Histograms, subplot2grid\n",
    "* Normalized Plots\n",
    "* Scatter Plots, subplots\n",
    "* Kernel Density Estimation Plots\n",
    "\n",
    "# Challenge\n",
    "* This is an assignment to practice data cleaning, visualization, and plotting.\n",
    "\n",
    "\n",
    "## Applying Matplotlib Visualizations to Kaggle: Titanic\n",
    "Prepare the titanic data to plot:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pylab as plt\n",
    "import seaborn\n",
    "import pytest\n",
    "import ipytest\n",
    "import unittest\n",
    "\n",
    "\n",
    "ipytest.autoconfig()\n",
    "# Set the global default size of matplotlib figures\n",
    "plt.rc(\"figure\", figsize=(10, 5))\n",
    "# Set seaborn aesthetic parameters to defaults\n",
    "seaborn.set()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv(\"../../assets/data/titanic_train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def label_encode(df, column_name, encoded_column_name):\n",
    "    \"\"\"Label encode one column of a Dataframe.\n",
    "\n",
    "    Args:\n",
    "        df (DataFrame): a data structure\n",
    "        column_name (string): the column name to encode\n",
    "        encoded_column_name (string): the new column name for the encoded result\n",
    "    \"\"\"\n",
    "    if df is None:\n",
    "        raise Exception(\"df cannot be None.\")\n",
    "\n",
    "    column = np.____(df[____].unique())\n",
    "\n",
    "    # Generate a mapping of column from a string to a number representation\n",
    "    column_value_mapping = ____(____(column, range(0, len(column))))\n",
    "\n",
    "    # Transform column from a string to a number representation\n",
    "    df[____] = df[column_name].map(column_value_mapping).astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h5><font color=blue>Check result by executing below... 📝</font></h5>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%ipytest -qq\n",
    "\n",
    "def create_test_df():\n",
    "    return pd.DataFrame({\"c1\": [\"male\", \"female\", \"female\", \"male\", \"male\"]})\n",
    "\n",
    "\n",
    "class TestLabelEncode(unittest.TestCase):\n",
    "    \"\"\"Tests for label_encode: one happy path and several invalid inputs.\"\"\"\n",
    "\n",
    "    def test_label_encode_happy_case(self):\n",
    "        # arrange\n",
    "        test_df = create_test_df()\n",
    "        expected_result = pd.DataFrame({\"c1\": [1, 0, 0, 1, 1]}, dtype=int)\n",
    "\n",
    "        # act: label_encode has no return value; the encoded column is read\n",
    "        # back from test_df below\n",
    "        label_encode(test_df, \"c1\", \"encoded_c1\")\n",
    "\n",
    "        # assert\n",
    "        assert test_df[\"encoded_c1\"].equals(expected_result[\"c1\"])\n",
    "\n",
    "    def test_label_encode_with_none_df(self):\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            label_encode(None, \"c1\", \"encoded_c1\")\n",
    "\n",
    "    def test_label_encode_with_empty_df(self):\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            label_encode(pd.DataFrame(), \"c1\", \"encoded_c1\")\n",
    "\n",
    "    def test_label_encode_invalid_column_name(self):\n",
    "        # arrange: build the frame here so the exception comes from the unknown\n",
    "        # column and not from an undefined test_df name\n",
    "        test_df = create_test_df()\n",
    "\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            label_encode(test_df, \"invalid_column_name\", \"encoded_c1\")\n",
    "\n",
    "    def test_label_encode_invalid_encoded_column_name(self):\n",
    "        # NOTE(review): test_df is not defined in this scope, so this check only\n",
    "        # passes through the resulting NameError. A plain string looks like an\n",
    "        # acceptable new column name, so with a real frame this call would\n",
    "        # probably not raise - TODO decide what this test should assert.\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            label_encode(test_df, \"c1\", \"invalid_column_name\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "    \n",
    "<details><summary>👩‍💻 <b>Hint</b></summary>\n",
    "\n",
    "You can consider to use <code>numpy.sort</code>.<br>Refer to <a href=\"https://pandas.pydata.org/docs/user_guide/indexing.html\">indexing and selecting data on</a> <code>pandas.DataFrame</code>.<br>Refer to <a href=\"https://docs.python.org/3/library/functions.html#func-dict\">function dict() on</a> <code>python</code>.<br>Refer to <a href=\"https://docs.python.org/3/library/functions.html#zip\">function zip() on</a> <code>python</code>.\n",
    "\n",
    "</details>\n",
    "\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_hot_encode(df, column_name, encoded_column_name_prefix):\n",
    "    \"\"\"Transforms a column in a DataFrame from a string to dummy variables.\n",
    "    \n",
    "    Args:\n",
    "        df (DataFrame): a data structure\n",
    "        column_name (string): the name of the column to be encoded\n",
    "        encoded_column_name_prefix (string): the prefix to be added to the names of the encoded columns\n",
    "    \"\"\"\n",
    "    return pd.____(\n",
    "        [df, pd.____(df[____], prefix=____, dtype=\"int64\")], axis=1\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h5><font color=blue>Check result by executing below... 📝</font></h5>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%ipytest -qq\n",
    "\n",
    "from pandas.testing import assert_frame_equal\n",
    "\n",
    "def create_test_df():\n",
    "    return pd.DataFrame({\"Embarked\": [\"S\", \"C\", \"Q\", \"S\", \"C\"]})\n",
    "\n",
    "\n",
    "class TestOneHotEncode(unittest.TestCase):\n",
    "    \"\"\"Tests for one_hot_encode: one happy path and several invalid inputs.\"\"\"\n",
    "\n",
    "    def test_one_hot_encode_happy_case(self):\n",
    "        # arrange\n",
    "        test_df = create_test_df()\n",
    "        # The original column is kept and one 0/1 column per distinct value is\n",
    "        # expected next to it, named with the given prefix\n",
    "        expected_result = pd.DataFrame(\n",
    "            {\n",
    "                \"Embarked\": [\"S\", \"C\", \"Q\", \"S\", \"C\"],\n",
    "                \"Embarked_Val_C\": [0, 1, 0, 0, 1],\n",
    "                \"Embarked_Val_Q\": [0, 0, 1, 0, 0],\n",
    "                \"Embarked_Val_S\": [1, 0, 0, 1, 0],\n",
    "            }\n",
    "        )\n",
    "\n",
    "        # act\n",
    "        actual_result = one_hot_encode(test_df, \"Embarked\", \"Embarked_Val\")\n",
    "\n",
    "        # assert: compare against expected_result (expected_df was never defined)\n",
    "        assert_frame_equal(actual_result, expected_result)\n",
    "\n",
    "    def test_one_hot_encode_with_none_df(self):\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            one_hot_encode(None, \"Embarked\", \"Embarked_Val\")\n",
    "\n",
    "    def test_one_hot_encode_with_empty_df(self):\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            one_hot_encode(pd.DataFrame(), \"Embarked\", \"Embarked_Val\")\n",
    "\n",
    "    def test_one_hot_encode_invalid_column_name(self):\n",
    "        # arrange: build the frame here so the exception comes from the unknown\n",
    "        # column and not from an undefined test_df name\n",
    "        test_df = create_test_df()\n",
    "\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            one_hot_encode(test_df, \"invalid_column_name\", \"Embarked_Val\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "    \n",
    "<details><summary>👩‍💻 <b>Hint</b></summary>\n",
    "\n",
    "Refer to <a href=\"https://pandas.pydata.org/docs/user_guide/indexing.html\">indexing and selecting data on</a> <code>pandas.DataFrame</code>.<br>\n",
    "You can consider to use <code>pandas.concat</code> and <code>pandas.get_dummies</code>.\n",
    "\n",
    "</details>\n",
    "\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def impute_with_mean(df, column_name, imputed_column_name):\n",
    "    \"\"\"Impute the gaps with the mean.\n",
    "\n",
    "    Args:\n",
    "        df (DataFrame): a data structure\n",
    "        column_name (string): the column name to impute\n",
    "        imputed_column_name (string): the new column name for the imputed result\n",
    "    \"\"\"\n",
    "    if len(df[df[____].isnull()]) > 0:\n",
    "        imputed_column_name = df[____].____()\n",
    "        df.replace({None: imputed_column_name}, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h5><font color=blue>Check result by executing below... 📝</font></h5>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%ipytest -qq\n",
    "\n",
    "def create_test_df():\n",
    "    return pd.DataFrame(\n",
    "        {\n",
    "            \"price\": [9, 8, 1, None, None],\n",
    "            \"price_add_average\": [None, None, None, None, None],\n",
    "        }\n",
    "    )\n",
    "\n",
    "\n",
    "class TestCleanFare(unittest.TestCase):\n",
    "    \"\"\"Tests for impute_with_mean: one happy path and several invalid inputs.\"\"\"\n",
    "\n",
    "    def test_impute_with_mean_happy_case(self):\n",
    "        # arrange\n",
    "        test_df = create_test_df()\n",
    "        # mean of the known prices 9, 8 and 1 is 6\n",
    "        expected_result = pd.DataFrame({\"result\": [6, 6, 6.0, 6.0, 6.0]}, dtype=float)\n",
    "\n",
    "        # act: the frame is modified in place, nothing is returned\n",
    "        impute_with_mean(test_df, \"price\", \"price_add_average\")\n",
    "\n",
    "        # assert\n",
    "        assert test_df[\"price_add_average\"].equals(expected_result[\"result\"])\n",
    "\n",
    "    def test_impute_with_mean_with_none_df(self):\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            impute_with_mean(None, \"price\", \"price_add_average\")\n",
    "\n",
    "    def test_impute_with_mean_with_empty_df(self):\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            impute_with_mean(pd.DataFrame(), \"price\", \"price_add_average\")\n",
    "\n",
    "    def test_impute_with_mean_invalid_column_name(self):\n",
    "        # arrange: build the frame here so the exception comes from the unknown\n",
    "        # column and not from an undefined test_df name\n",
    "        test_df = create_test_df()\n",
    "\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            impute_with_mean(test_df, \"invalid_column_name\", \"price_add_average\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "    \n",
    "<details><summary>👩‍💻 <b>Hint</b></summary>\n",
    "\n",
    "You can consider to use <code>pandas.DataFrame.mean</code> and refer to <a href=\"https://pandas.pydata.org/docs/user_guide/indexing.html\">indexing and selecting data on</a> <code>pandas.DataFrame</code>.\n",
    "\n",
    "</details>\n",
    "\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def impute_with_median(df, column_name, column_value_fill, column_value):\n",
    "    \"\"\"Impute the missing ages with the median.\n",
    "\n",
    "    Args:\n",
    "        df (DataFrame): a data structure\n",
    "        column_name (string): the column name to impute\n",
    "        column_value_fill (string): the new column name for the imputed result\n",
    "        column_value (string): the column name to group by, together with Pclass, when computing the median\n",
    "    \"\"\"\n",
    "    df[column_value_fill] = df[____]\n",
    "    df[column_value_fill] = (\n",
    "        df[____]\n",
    "        .groupby([df[____], df[\"Pclass\"]], group_keys=False)\n",
    "        .apply(lambda x: x.____(x.____()))\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h5><font color=blue>Check result by executing below... 📝</font></h5>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%ipytest -qq\n",
    "\n",
    "from pandas.testing import assert_frame_equal\n",
    "\n",
    "def create_test_df():\n",
    "    return pd.DataFrame(\n",
    "        {\n",
    "            \"Age\": [\n",
    "                1,\n",
    "                2,\n",
    "                3,\n",
    "                4,\n",
    "                5,\n",
    "                6,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                7,\n",
    "                8,\n",
    "                9,\n",
    "                10,\n",
    "                11,\n",
    "                12,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "            ],\n",
    "            \"Age_median_impute\": [\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "                None,\n",
    "            ],\n",
    "            \"Pclass\": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3],\n",
    "            \"Sex_Val\": [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
    "        }\n",
    "    )\n",
    "\n",
    "\n",
    "class TestCleanFare(unittest.TestCase):\n",
    "    \"\"\"Tests for impute_with_median: one happy path and several invalid inputs.\"\"\"\n",
    "\n",
    "    def test_impute_with_median_happy_case(self):\n",
    "        # arrange\n",
    "        test_df = create_test_df()\n",
    "        # Age is expected to stay untouched; Age_median_impute is expected to hold\n",
    "        # Age with every gap replaced by the median Age of the rows sharing the\n",
    "        # same Sex_Val and Pclass\n",
    "        expected_df = pd.DataFrame(\n",
    "            {\n",
    "                \"Age\": [\n",
    "                    1, 2, 3, 4, 5, 6, None, None, None,\n",
    "                    7, 8, 9, 10, 11, 12, None, None, None,\n",
    "                ],\n",
    "                \"Age_median_impute\": [\n",
    "                    1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 2.5, 3.5, 4.5,\n",
    "                    7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 8.5, 9.5, 10.5,\n",
    "                ],\n",
    "                \"Pclass\": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3],\n",
    "                \"Sex_Val\": [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
    "            }\n",
    "        )\n",
    "\n",
    "        # act: the frame is modified in place, nothing is returned\n",
    "        impute_with_median(test_df, \"Age\", \"Age_median_impute\", \"Sex_Val\")\n",
    "\n",
    "        # assert\n",
    "        assert_frame_equal(test_df, expected_df)\n",
    "\n",
    "    def test_impute_with_median_with_none_df(self):\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            impute_with_median(None, \"Age\", \"Age_median_impute\", \"Sex_Val\")\n",
    "\n",
    "    def test_impute_with_median_with_empty_df(self):\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            impute_with_median(pd.DataFrame(), \"Age\", \"Age_median_impute\", \"Sex_Val\")\n",
    "\n",
    "    def test_impute_with_median_invalid_column_name(self):\n",
    "        # arrange: build the frame here so the exception comes from the unknown\n",
    "        # column and not from an undefined test_df name\n",
    "        test_df = create_test_df()\n",
    "\n",
    "        # act & assert\n",
    "        with pytest.raises(Exception):\n",
    "            impute_with_median(test_df, \"invalid_column_name\", \"Age_median_impute\", \"Sex_Val\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "    \n",
    "<details><summary>👩‍💻 <b>Hint</b></summary>\n",
    "\n",
    "You can consider to use <code>pandas.DataFrame.fillna</code> and <code>pandas.DataFrame.median</code>.<br>Refer to <a href=\"https://pandas.pydata.org/docs/user_guide/indexing.html\">indexing and selecting data on</a> <code>pandas.DataFrame</code>.\n",
    "\n",
    "</details>\n",
    "\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_data(df):\n",
    "    # Fill in missing values of Embarked\n",
    "    # Since the vast majority of passengers embarked at 'S',\n",
    "    # we assign the missing values in Embarked to 'S':\n",
    "    df[\"Embarked\"] = df[\"Embarked\"].fillna(\"S\")\n",
    "    label_encode(df, \"Sex\", \"Sex_Val\")\n",
    "\n",
    "    # Label encode Embarked into the new column Embarked_Val\n",
    "    label_encode(df, \"Embarked\", \"Embarked_Val\")\n",
    "\n",
    "    # Transform Embarked from a string to dummy variables\n",
    "    df = one_hot_encode(df, \"Embarked\", \"Embarked_Val\")\n",
    "\n",
    "    # Fill in missing values of Fare with the average Fare\n",
    "    impute_with_mean(df, \"Fare\", \"Fare_add_average\")\n",
    "\n",
    "    # To keep Age intact, make a copy of it called AgeFill\n",
    "    # that we will use to fill in the missing ages:\n",
    "    # Determine the Age typical for each passenger class by Sex_Val.\n",
    "    # We'll use the median instead of the mean because the Age\n",
    "    # histogram seems to be right skewed.\n",
    "    impute_with_median(df, \"Age\", \"AgeFill\", \"Sex_Val\")\n",
    "\n",
    "    # Define a new feature FamilySize that is the sum of\n",
    "    # Parch (number of parents or children on board) and\n",
    "    # SibSp (number of siblings or spouses):\n",
    "    df[\"FamilySize\"] = df[\"Parch\"] + df[\"SibSp\"]\n",
    "\n",
    "    return df\n",
    "\n",
    "\n",
    "df_train = clean_data(df_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bar Plots, Histograms, subplot2grid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Size of matplotlib figures that contain subplots\n",
    "figsize_with_subplots = (10, 10)\n",
    "\n",
    "# Set up a grid of plots\n",
    "fig = plt.figure(figsize=figsize_with_subplots)\n",
    "fig_dims = (3, 2)\n",
    "\n",
    "# Plot death and survival counts\n",
    "plt.subplot2grid(fig_dims, (0, 0))\n",
    "\n",
    "\n",
    "def create_sub_plot_2_grid(\n",
    "    df, column_name, plot_title, plot_kind, plot_color=\"b\", plot_align=\"center\"\n",
    "):\n",
    "    df[____].____().____(\n",
    "        kind=plot_kind, title=plot_title, color=plot_color, align=plot_align\n",
    "    )\n",
    "\n",
    "\n",
    "create_sub_plot_2_grid(\n",
    "    df_train, \"Survived\", \"Death and Survival Counts\", \"bar\", \"r\", \"center\"\n",
    ")\n",
    "\n",
    "# Plot Pclass counts\n",
    "plt.subplot2grid(fig_dims, (0, 1))\n",
    "\n",
    "create_sub_plot_2_grid(df_train, \"Pclass\", \"Passenger Class Counts\", \"bar\")\n",
    "# Plot Sex counts\n",
    "plt.subplot2grid(fig_dims, (1, 0))\n",
    "\n",
    "create_sub_plot_2_grid(df_train, \"Sex\", \"Gender Counts\", \"bar\")\n",
    "plt.xticks(rotation=0)\n",
    "\n",
    "# Plot Embarked counts\n",
    "plt.subplot2grid(fig_dims, (1, 1))\n",
    "\n",
    "create_sub_plot_2_grid(df_train, \"Embarked\", \"Ports of Embarkation Counts\", \"bar\")\n",
    "\n",
    "# Plot the Age histogram\n",
    "plt.subplot2grid(fig_dims, (2, 0))\n",
    "df_train[\"Age\"].hist()\n",
    "plt.title(\"Age Histogram\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "    \n",
    "<details><summary>👩‍💻 <b>Hint</b></summary>\n",
    "\n",
    "Refer to <a href=\"https://pandas.pydata.org/docs/user_guide/indexing.html\">indexing and selecting data on</a> <code>pandas.DataFrame</code>.<br>\n",
    "You can consider to use <code>pandas.Series.value_counts</code> and <code>pandas.DataFrame.plot</code>.\n",
    "\n",
    "</details>\n",
    "\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the unique values of FamilySize and its maximum\n",
    "family_sizes = np.____(df_train[\"FamilySize\"].unique())\n",
    "family_size_max = max(family_sizes)\n",
    "\n",
    "df1 = df_train[df_train[\"Survived\"] == 0][\"FamilySize\"]\n",
    "df2 = df_train[df_train[\"Survived\"] == 1][\"FamilySize\"]\n",
    "plt.____([df1, df2], bins=family_size_max + 1, range=(0, family_size_max), stacked=True)\n",
    "plt.legend((\"Died\", \"Survived\"), loc=\"best\")\n",
    "plt.title(\"Survivors by Family Size\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "    \n",
    "<details><summary>👩‍💻 <b>Hint</b></summary>\n",
    "You can consider to use <code>numpy.sort</code> and use <code>matplotlib.pyplot.hist</code> to plot.\n",
    "\n",
    "</details>\n",
    "\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Normalized Plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pclass_xt = pd.crosstab(df_train[\"Pclass\"], df_train[\"Survived\"])\n",
    "\n",
    "# Normalize the cross tab to sum to 1:\n",
    "pclass_xt_pct = pclass_xt.____(pclass_xt.sum(1).astype(float), axis=0)\n",
    "\n",
    "pclass_xt_pct.____(kind=\"bar\", stacked=True, title=\"Survival Rate by Passenger Classes\")\n",
    "plt.xlabel(\"Passenger Class\")\n",
    "plt.ylabel(\"Survival Rate\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_survival_rate_by_gender(gender):\n",
    "    \"\"\"Computes the survival rate for a given gender.\n",
    "\n",
    "    Reads the module-level df_train.\n",
    "\n",
    "    Args:\n",
    "        gender (string): value of the Sex column to select\n",
    "\n",
    "    Returns:\n",
    "        a DataFrame containing the survival rate of passengers of the specified gender in each passenger class\n",
    "    \"\"\"\n",
    "    gender_df = df_train[df_train[\"Sex\"] == gender]\n",
    "    # Take both Series from the filtered frame instead of relying on index\n",
    "    # alignment against the unfiltered df_train\n",
    "    gender_xt = pd.crosstab(gender_df[\"Pclass\"], gender_df[\"Survived\"])\n",
    "    gender_xt_pct = gender_xt.____(gender_xt.sum(1).astype(float), axis=0)\n",
    "    return gender_xt_pct\n",
    "\n",
    "\n",
    "# Plot female survival rate by passenger class\n",
    "gender_xt_pct = get_survival_rate_by_gender(\"female\")\n",
    "\n",
    "\n",
    "gender_xt_pct.____(\n",
    "    kind=\"bar\", stacked=True, title=\"Female Survival Rate by Passenger Class\"\n",
    ")\n",
    "plt.xlabel(\"Passenger Class\")\n",
    "plt.ylabel(\"Survival Rate\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot male survival rate by passenger class\n",
    "gender_xt_pct = get_survival_rate_by_gender(\"male\")\n",
    "gender_xt_pct.____(\n",
    "    kind=\"bar\", stacked=True, title=\"Male Survival Rate by Passenger Class\"\n",
    ")\n",
    "plt.xlabel(\"Passenger Class\")\n",
    "plt.ylabel(\"Survival Rate\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "    \n",
    "<details><summary>👩‍💻 <b>Hint</b></summary>\n",
    "You can consider to use <code>pandas.DataFrame.div</code> and <code>pandas.DataFrame.plot</code>.\n",
    "\n",
    "</details>\n",
    "\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scatter Plots, subplots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up a grid of plots\n",
    "fig, axes = plt.subplots(2, 1, figsize=figsize_with_subplots)\n",
    "\n",
    "\n",
    "def get_age_by_survived(df, survived):\n",
    "    \"\"\"Get passenger age from survived.\n",
    "    \n",
    "    Args:\n",
    "        df (DataFrame): a data structure\n",
    "        survived (int): the value of the Survived column to filter on (0 or 1)\n",
    "    \n",
    "    Returns:\n",
    "        a Pandas Series containing the age of all passengers with the specified survival status\n",
    "    \"\"\"\n",
    "    df = df[df[\"Survived\"] == survived][\"Age\"]\n",
    "    return df\n",
    "\n",
    "\n",
    "df1 = get_age_by_survived(df_train, 0)\n",
    "df2 = get_age_by_survived(df_train, 1)\n",
    "max_age = int(max(df_train[\"AgeFill\"]))\n",
    "\n",
    "axes[1].____([df1, df2], bins=int(max_age / 10), range=(1, max_age), stacked=True)\n",
    "axes[1].legend((\"Died\", \"Survived\"), loc=\"best\")\n",
    "axes[1].set_title(\"Survivors by Age Groups Histogram\")\n",
    "axes[1].set_xlabel(\"Age\")\n",
    "axes[1].set_ylabel(\"Count\")\n",
    "\n",
    "# Scatter plot Survived and AgeFill\n",
    "axes[0].____(df_train[\"Survived\"], df_train[\"AgeFill\"])\n",
    "axes[0].set_title(\"Survivors by Age Plot\")\n",
    "axes[0].set_xlabel(\"Survived\")\n",
    "axes[0].set_ylabel(\"Age\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "    \n",
    "<details><summary>👩‍💻 <b>Hint</b></summary>\n",
    "You can consider to use <code>matplotlib.axes.Axes.hist</code> and <code>matplotlib.axes.Axes.scatter</code>.\n",
    "\n",
    "</details>\n",
    "\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Kernel Density Estimation Plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the unique values of Pclass:\n",
    "def get_the_unique_values_of_Pclass(df):\n",
    "    \"\"\"Plot the column 'AgeFill' for each unique value of pclass in the input DataFrame.\n",
    "    \n",
    "    Args:\n",
    "        df (DataFrame): a data structure\n",
    "    \"\"\"\n",
    "    passenger_classes = np.____(df[\"Pclass\"].unique())\n",
    "    for pclass in passenger_classes:\n",
    "        df[\"AgeFill\"][df[\"Pclass\"] == pclass].____(kind=\"kde\")\n",
    "\n",
    "\n",
    "get_the_unique_values_of_Pclass(df_train)\n",
    "plt.title(\"Age Density Plot by Passenger Class\")\n",
    "plt.xlabel(\"Age\")\n",
    "plt.legend((\"1st Class\", \"2nd Class\", \"3rd Class\"), loc=\"best\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "    \n",
    "<details><summary>👩‍💻 <b>Hint</b></summary>\n",
    "You can consider to use <code>numpy.sort</code> and <code>pandas.DataFrame.plot</code>.<br>\n",
    "\n",
    "</details>\n",
    "\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Acknowledgments"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Thanks to Donne Martin for creating the open-source project <a href=\"https://github.com/donnemartin/data-science-ipython-notebooks\">data-science-ipython-notebooks</a>, which inspires the majority of the content in this chapter."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "my_conda1",
   "language": "python",
   "name": "my_conda1"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "vscode": {
   "interpreter": {
    "hash": "41d25e42d6f40b10a4a523cb7c3fdf925a3f2162b0475594bd39b3950d44f5b4"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}