{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Single Variable Visualizations #\n",
    "\n",
    "## (Graphs) ##\n",
    "\n",
    "In this notebook we will cover more thoroughly than before:\n",
    "\n",
    "- How to make and interpret a bar graph or bar chart\n",
    "- How to make and interpret a histogram\n",
    "- The fundamental differences between bar charts and histograms\n",
    "- How to make and interpret boxplots\n",
    "- When possible how to make side-by-side or overlapping graphs to compare <u>two groups</u>\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')\n",
    "plots.rcParams[\"patch.force_edgecolor\"] = True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Categorical Distribution ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#help(Table.hist)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bar Charts ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_movies = Table.read_table('top_movies_2017.csv')\n",
    "top_movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_movies = top_movies.with_column('Millions', np.round(top_movies.column('Gross')/1000000,3))\n",
    "top_movies.take(np.arange(10)).barh('Title', 'Millions')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When the bars of a bar graph are arranged from longest to shortest, that is called a *Pareto* chart."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_movies.take(np.arange(10)).sort(\"Millions\", descending=True).barh('Title', 'Millions')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "studios = top_movies.select('Studio')\n",
    "studios"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "studio_distribution = studios.group('Studio')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "studio_distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(studio_distribution.column('count'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "studio_distribution.barh('Studio')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Pareto\n",
    "\n",
    "studio_distribution.sort('count', descending=True).barh('Studio')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "baby = Table.read_table(\"baby.csv\")\n",
    "baby"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "baby.group(\"Maternal Smoker\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "baby.group(\"Maternal Smoker\").barh(\"Maternal Smoker\")\n",
    "plots.xlabel(\"Count\", color = \"blue\")\n",
    "plots.title(\"Counts of Mothers in \\nBaby Dataset\", color = \"blue\")\n",
    "plots.yticks(make_array(.5, 1.5), make_array(\"Yes\", \"No\"), color = \"gold\")\n",
    "plots.ylabel(\"Smoker\", color =\"blue\", size = 25);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(plots)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Numerical Distribution ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ages = 2022 - top_movies.column('Year')\n",
    "top_movies = top_movies.with_column('Age', ages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_movies"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Binning ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "min(ages), max(ages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_bins = make_array(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "binned_data = top_movies.bin('Age', bins = my_bins)\n",
    "binned_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(binned_data.column('Age count'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_movies.bin('Age', bins = np.arange(0, 126, 25))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_movies.bin('Age', bins = np.arange(0, 60, 25))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_movies.where('Age', 52)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Histograms ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "binned_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's make our first histogram!\n",
    "top_movies.hist('Age')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's try picking our own bins instead.\n",
    "top_movies.hist('Age', bins = np.arange(0, 100, 10),  unit = 'Year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_movies.hist('Age', bins = np.arange(0, 110, 10), unit = 'Year', density=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "25/2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's try not specifying any bins!\n",
    "top_movies.hist('Age', unit='Year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "3*9.61+5\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a column containing what percent of movies are in each bin\n",
    "binned_data = binned_data.with_column(\n",
    "    'Percent', 100*binned_data.column('Age count')/200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "binned_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Height ##\n",
    "\n",
    "### Question: What is the height of the [42, 66] bin?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Calculate % of movies in the [40, 65) bin\n",
    "percent = binned_data.where('bin', 42).column('Percent').item(0)\n",
    "percent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 2: Calculate the width of the 42-66 bin\n",
    "width = 66 - 42\n",
    "width"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 3: Area of rectangle = height * width\n",
    "#         --> height = percent / width\n",
    "height = percent / width\n",
    "height"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What are the heights of the rest of the bins?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the bin lefts\n",
    "bin_lefts = binned_data.take(np.arange(binned_data.num_rows - 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the bin widths\n",
    "bin_widths = np.diff(binned_data.column('bin'))\n",
    "bin_lefts = bin_lefts.with_column('Width', bin_widths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the bin heights\n",
    "bin_heights = bin_lefts.column('Percent') / bin_widths\n",
    "bin_lefts = bin_lefts.with_column('Height', bin_heights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bin_lefts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_movies.hist('Age', bins = my_bins, unit = 'Year')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Please note that the example above is a bad example.  On a histogram, bins should be the **same** size.  Now, let's look at another example showing the impact of bins.  \n",
    "\n",
    "### The impact of bins ###"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "survey = Table.read_table('welcome_survey_v1.csv')\n",
    "survey"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sleep_bins = np.arange(4, 12, 0.5)\n",
    "sleep_bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "survey.hist('Hours of sleep', bins=sleep_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sleep_bins = np.arange(4,12, 1)\n",
    "survey.hist('Hours of sleep', bins=sleep_bins)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Boxplots\n",
    "\n",
    "Box plots are a graphical display of the descriptive statistics that we know as the Five Number Summary.\n",
    "\n",
    "Five Number Summary is as follows\n",
    "\n",
    "- The minimum\n",
    "- The $25^{th}$ percentile, $Q_1$\n",
    "- The $50^{th}$ percentile, the median\n",
    "- The $75^{th}$ percentile, $Q_3$\n",
    "- The maximum\n",
    "\n",
    "In one of the next couple of notebooks, we will discuss how to compute these.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "skyscrapers = Table.read_table('skyscrapers.csv')\n",
    "ny = skyscrapers.where('city', \"New York City\")\n",
    "la = skyscrapers.where('city', 'Los Angeles')\n",
    "ny"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plots.boxplot(ny.column(3), widths = 0.4);\n",
    "#plots.xlabel(\"NYC\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plots.boxplot(ny.column(3), widths=.5 )\n",
    "plots.xticks(make_array(1), make_array(\"NYC\"))\n",
    "plots.title(\"Heights of New York City \\nSkyscrapers\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Can we also display the mean on a boxplot? ###\n",
    "\n",
    "Yes, if you set the value of **showmeans** equal to True.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plots.boxplot(ny.column(3), widths=.5 , showmeans=True)\n",
    "plots.xticks(make_array(1), make_array(\"NYC\"));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plots.boxplot(ny.column(3), widths=.5 , showmeans=True, whis=10)\n",
    "plots.xticks(make_array(1), make_array(\"NYC\"));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Comparing two groups numerically ##\n",
    "\n",
    "We can use side-by-side boxplots to visually compare two groups."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ticks=make_array(2.5, 3.5)\n",
    "labels=make_array(\"NYC\", \"LA\")\n",
    "\n",
    "plots.figure(figsize=(6, 6))\n",
    "plots.boxplot(ny.column(3), widths=.5, positions=make_array(ticks.item(0)) )\n",
    "plots.boxplot(la.column(3), widths=.5, positions=make_array(ticks.item(1)) )\n",
    "plots.xticks(ticks, labels)\n",
    "plots.title(\"Boxplots Comparing NYC and LA Skyscrapers\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ticks=make_array(2.5, 3.5)\n",
    "labels=make_array(\"NYC\", \"LA\")\n",
    "\n",
    "plots.figure(figsize=(6, 6))\n",
    "plots.boxplot(ny.column(3), widths=.5, positions=make_array(ticks.item(0)), showmeans=True)\n",
    "plots.boxplot(la.column(3), widths=.5, positions=make_array(ticks.item(1)), showmeans=True)\n",
    "plots.xticks(ticks, labels)\n",
    "plots.title(\"Boxplots Comparing NYC and LA Skyscrapers\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "smokers = baby.where(\"Maternal Smoker\", True).column(\"Birth Weight\")\n",
    "nonsmokers = baby.where(\"Maternal Smoker\", False).column(\"Birth Weight\")\n",
    "\n",
    "\n",
    "ticks=make_array(2.5, 3.5)\n",
    "labels=make_array(\"Smokers\", \"Non-Smokers\")\n",
    "\n",
    "plots.figure(figsize=(6, 6))\n",
    "plots.boxplot(smokers, widths=.5, positions=make_array(ticks.item(0)), showmeans=True)\n",
    "plots.boxplot(nonsmokers, widths=.5, positions=make_array(ticks.item(1)), showmeans=True)\n",
    "plots.xticks(ticks, labels)\n",
    "plots.title(\"Comparing Birth Weights \\nof Babies\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sometimes, overlapping histograms are also possible.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton = Table.read_table('galton.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton.hist('midparentHeight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton.hist('childHeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What part of this histogram is the children with heights above 70 inches?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton.hist('childHeight', left_end=70)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton.hist('midparentHeight', 'childHeight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton.hist(\"mother\", \"father\", \"childHeight\", \"midparentHeight\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ticks=make_array(2.5, 3.5, 4.5, 5.5)\n",
    "labels=make_array(\"Midparent\", \"Child\", \"Mother\", \"Father\")\n",
    "\n",
    "plots.figure(figsize=(6, 6))\n",
    "plots.boxplot(galton.column(\"midparentHeight\"), widths=.5, positions=make_array(ticks.item(0)), showmeans=True)\n",
    "plots.boxplot(galton.column(\"childHeight\"), widths=.5, positions=make_array(ticks.item(1)), showmeans=True)\n",
    "plots.boxplot(galton.column(\"mother\"), widths=.5, positions=make_array(ticks.item(2)), showmeans=True)\n",
    "plots.boxplot(galton.column(\"father\"), widths=.5, positions=make_array(ticks.item(3)), showmeans=True)\n",
    "plots.xticks(ticks, labels)\n",
    "plots.title(\"Comparing Heights\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}