{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Single Variable Visualizations #\n", "\n", "## (Graphs) ##\n", "\n", "In this notebook we will cover, more thoroughly than before:\n", "\n", "- How to make and interpret a bar graph or bar chart\n", "- How to make and interpret a histogram\n", "- The fundamental differences between bar charts and histograms\n", "- How to make and interpret boxplots\n", "- How to make side-by-side or overlapping graphs to compare two groups, when possible\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datascience import *\n", "import numpy as np\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "%matplotlib inline\n", "import matplotlib.pyplot as plots\n", "plots.style.use('fivethirtyeight')\n", "plots.rcParams[\"patch.force_edgecolor\"] = True" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Categorical Distribution ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#help(Table.hist)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bar Charts ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_movies = Table.read_table('top_movies_2017.csv')\n", "top_movies" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_movies = top_movies.with_column('Millions', np.round(top_movies.column('Gross')/1000000,3))\n", "top_movies.take(np.arange(10)).barh('Title', 'Millions')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When the bars of a bar graph are arranged from longest to shortest, that is called a *Pareto* chart." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_movies.take(np.arange(10)).sort(\"Millions\", descending=True).barh('Title', 'Millions')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "studios = top_movies.select('Studio')\n", "studios" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "studio_distribution = studios.group('Studio')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "studio_distribution" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sum(studio_distribution.column('count'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "studio_distribution.barh('Studio')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Pareto\n", "\n", "studio_distribution.sort('count', descending=True).barh('Studio')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "baby = Table.read_table(\"baby.csv\")\n", "baby" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "baby.group(\"Maternal Smoker\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "baby.group(\"Maternal Smoker\").barh(\"Maternal Smoker\")\n", "plots.xlabel(\"Count\", color = \"blue\")\n", "plots.title(\"Counts of Mothers in \\nBaby Dataset\", color = \"blue\")\n", "plots.yticks(make_array(.5, 1.5), make_array(\"Yes\", \"No\"), color = \"gold\")\n", "plots.ylabel(\"Smoker\", color =\"blue\", size = 25);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "help(plots)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Numerical Distribution ##" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "ages = 2022 - top_movies.column('Year')\n", "top_movies = top_movies.with_column('Age', ages)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_movies" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Binning ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "min(ages), max(ages)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "my_bins = make_array(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "binned_data = top_movies.bin('Age', bins = my_bins)\n", "binned_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sum(binned_data.column('Age count'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_movies.bin('Age', bins = np.arange(0, 126, 25))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_movies.bin('Age', bins = np.arange(0, 60, 25))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_movies.where('Age', 52)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Histograms ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "my_bins" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "binned_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Let's make our first histogram!\n", "top_movies.hist('Age')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Let's try picking our own bins instead.\n", "top_movies.hist('Age', bins = 
np.arange(0, 100, 10), unit = 'Year')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_movies.hist('Age', bins = np.arange(0, 110, 10), unit = 'Year', density=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "25/2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Let's try not specifying any bins!\n", "top_movies.hist('Age', unit='Year')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "3*9.61+5\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Add a column containing what percent of movies are in each bin.\n", "# Divide by the actual number of movies instead of a hard-coded count.\n", "binned_data = binned_data.with_column(\n", " 'Percent', 100*binned_data.column('Age count')/top_movies.num_rows)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "binned_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Height ##\n", "\n", "### Question: What is the height of the [40, 50) bin?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Step 1: Calculate % of movies in the [40, 50) bin\n", "# 40 is one of the edges in my_bins, so exactly one row of binned_data matches\n", "percent = binned_data.where('bin', 40).column('Percent').item(0)\n", "percent" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Step 2: Calculate the width of the [40, 50) bin\n", "width = 50 - 40\n", "width" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Step 3: Area of rectangle = height * width\n", "# --> height = percent / width\n", "height = percent / width\n", "height" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### What are the heights of the rest of the bins?" 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get the bin lefts\n", "bin_lefts = binned_data.take(np.arange(binned_data.num_rows - 1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get the bin widths\n", "bin_widths = np.diff(binned_data.column('bin'))\n", "bin_lefts = bin_lefts.with_column('Width', bin_widths)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get the bin heights\n", "bin_heights = bin_lefts.column('Percent') / bin_widths\n", "bin_lefts = bin_lefts.with_column('Height', bin_heights)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bin_lefts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_movies.hist('Age', bins = my_bins, unit = 'Year')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Please note that on a histogram, bins should normally be the **same** size, as they are in the example above. Now, let's look at another example showing the impact of bins. 
\n", "\n", "### The impact of bins ###" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey = Table.read_table('welcome_survey_v1.csv')\n", "survey" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sleep_bins = np.arange(4, 12, 0.5)\n", "sleep_bins" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey.hist('Hours of sleep', bins=sleep_bins)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sleep_bins = np.arange(4,12, 1)\n", "survey.hist('Hours of sleep', bins=sleep_bins)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Boxplots\n", "\n", "Boxplots are a graphical display of the descriptive statistics that we know as the Five Number Summary.\n", "\n", "The Five Number Summary is as follows:\n", "\n", "- The minimum\n", "- The $25^{th}$ percentile, $Q_1$\n", "- The $50^{th}$ percentile, the median\n", "- The $75^{th}$ percentile, $Q_3$\n", "- The maximum\n", "\n", "In one of the next couple of notebooks, we will discuss how to compute these. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "skyscrapers = Table.read_table('skyscrapers.csv')\n", "ny = skyscrapers.where('city', \"New York City\")\n", "la = skyscrapers.where('city', 'Los Angeles')\n", "ny" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plots.boxplot(ny.column(3), widths = 0.4);\n", "#plots.xlabel(\"NYC\");" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plots.boxplot(ny.column(3), widths=.5 )\n", "plots.xticks(make_array(1), make_array(\"NYC\"))\n", "plots.title(\"Heights of New York City \\nSkyscrapers\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Can we also display the mean on a boxplot? 
###\n", "\n", "Yes, if you set the value of **showmeans** equal to True. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plots.boxplot(ny.column(3), widths=.5 , showmeans=True)\n", "plots.xticks(make_array(1), make_array(\"NYC\"));" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plots.boxplot(ny.column(3), widths=.5 , showmeans=True, whis=10)\n", "plots.xticks(make_array(1), make_array(\"NYC\"));" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Comparing two groups numerically ##\n", "\n", "We can use side-by-side boxplots to visually compare two groups." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ticks=make_array(2.5, 3.5)\n", "labels=make_array(\"NYC\", \"LA\")\n", "\n", "plots.figure(figsize=(6, 6))\n", "plots.boxplot(ny.column(3), widths=.5, positions=make_array(ticks.item(0)) )\n", "plots.boxplot(la.column(3), widths=.5, positions=make_array(ticks.item(1)) )\n", "plots.xticks(ticks, labels)\n", "plots.title(\"Boxplots Comparing NYC and LA Skyscrapers\");" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ticks=make_array(2.5, 3.5)\n", "labels=make_array(\"NYC\", \"LA\")\n", "\n", "plots.figure(figsize=(6, 6))\n", "plots.boxplot(ny.column(3), widths=.5, positions=make_array(ticks.item(0)), showmeans=True)\n", "plots.boxplot(la.column(3), widths=.5, positions=make_array(ticks.item(1)), showmeans=True)\n", "plots.xticks(ticks, labels)\n", "plots.title(\"Boxplots Comparing NYC and LA Skyscrapers\");" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "smokers = baby.where(\"Maternal Smoker\", True).column(\"Birth Weight\")\n", "nonsmokers = baby.where(\"Maternal Smoker\", False).column(\"Birth Weight\")\n", "\n", 
"\n", "ticks=make_array(2.5, 3.5)\n", "labels=make_array(\"Smokers\", \"Non-Smokers\")\n", "\n", "plots.figure(figsize=(6, 6))\n", "plots.boxplot(smokers, widths=.5, positions=make_array(ticks.item(0)), showmeans=True)\n", "plots.boxplot(nonsmokers, widths=.5, positions=make_array(ticks.item(1)), showmeans=True)\n", "plots.xticks(ticks, labels)\n", "plots.title(\"Comparing Birth Weights \\nof Babies\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sometimes, overlapping histograms are also possible. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton = Table.read_table('galton.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton.hist('midparentHeight')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton.hist('childHeight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What part of this histogram is the children with heights above 70 inches?\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton.hist('childHeight', left_end=70)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton.hist('midparentHeight', 'childHeight')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton.hist(\"mother\", \"father\", \"childHeight\", \"midparentHeight\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ticks=make_array(2.5, 3.5, 4.5, 5.5)\n", "labels=make_array(\"Midparent\", \"Child\", \"Mother\", \"Father\")\n", "\n", "plots.figure(figsize=(6, 6))\n", "plots.boxplot(galton.column(\"midparentHeight\"), widths=.5, positions=make_array(ticks.item(0)), showmeans=True)\n", 
"plots.boxplot(galton.column(\"childHeight\"), widths=.5, positions=make_array(ticks.item(1)), showmeans=True)\n", "plots.boxplot(galton.column(\"mother\"), widths=.5, positions=make_array(ticks.item(2)), showmeans=True)\n", "plots.boxplot(galton.column(\"father\"), widths=.5, positions=make_array(ticks.item(3)), showmeans=True)\n", "plots.xticks(ticks, labels)\n", "plots.title(\"Comparing Heights\");" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" } }, "nbformat": 4, "nbformat_minor": 1 }