{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 15 ##"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Sampling ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#United Airlines domestic flights \n",
    "# departing from San Francisco in the summer of 2015\n",
    "united = Table.read_table('united_summer2015.csv')\n",
    "united = united.with_column('Row', np.arange(united.num_rows)).move_to_start('Row')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Deterministic sample\n",
    "united.take(make_array(999, 1000, 1001))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Systematic sample\n",
    "start = np.random.choice(np.arange(1000))\n",
    "systematic_sample = united.take(np.arange(start, united.num_rows, 1000))\n",
    "systematic_sample.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Distributions ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Options for a six-side die\n",
    "die = Table().with_column('Face', np.arange(1, 7))\n",
    "die"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Table method .sample() draws at random with replacement \n",
    "# from the rows of a table (optional argument: with_replacement=False)\n",
    "die.sample(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Setup bins for the die options (middle of bin are integers)\n",
    "roll_bins = np.arange(0.5, 6.6, 1)\n",
    "roll_bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Theoretical distribution (equal chance for each outcome)\n",
    "die.hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Empirical distribution based on sample of size 10\n",
    "die.sample(10).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Large Random Samples ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Another empirical distribution based on sample of size 10\n",
    "die.sample(10).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#And another (distributions change quite a bit)\n",
    "die.sample(10).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Increase sample size to 100\n",
    "die.sample(100).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Another empirical  distribution with a sample size of 100\n",
    "#These appear more stable than with a sample size of 10\n",
    "die.sample(100).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Sample size of 1000\n",
    "die.sample(1000).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Another with a sample size of 1000\n",
    "#More stable than those with a sample size of 10 or 100\n",
    "#Closer to the theoretical distribution (equal probabilities)\n",
    "die.sample(1000).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Let's get back to the united data\n",
    "#Recall the column headers\n",
    "united.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Plot a histogram of the delays\n",
    "united.hist('Delay', bins = np.arange(-20, 201, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#We missed some values with the bins we specified\n",
    "min(united.column('Delay')), max(united.column('Delay'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Let's see which row had the 580 minute delay\n",
    "united.where('Delay', 580)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Check how many were above 200\n",
    "#We will focus on where the bulk of the data are and remove the 0.8% large values for now\n",
    "united.where('Delay', are.above(200)).num_rows/united.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#united included all the domestic flights during the summer 2015 departing from San Francisco \n",
    "#think of united as a population\n",
    "#now we will draw samples from it and view the corresponding empirical distributions\n",
    "united.sample(10).hist('Delay', bins = np.arange(-20, 201, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Another empirical distribution with a sample size of 10\n",
    "united.sample(10).hist('Delay', bins = np.arange(-20, 201, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Empirical distribution with a sample size of 100\n",
    "united.sample(100).hist('Delay', bins = np.arange(-20, 201, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Another empirical distribution with a sample size of 100\n",
    "united.sample(100).hist('Delay', bins = np.arange(-20, 201, 10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simulating Statistics ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Considering united as a population\n",
    "#The population median delay is:\n",
    "np.median(united.column('Delay'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Percentage of data less than or equal to the median\n",
    "united.where('Delay', are.below_or_equal_to(2)).num_rows / united.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#What if we take a random sample of size 10 - what's the estimated median?\n",
    "np.median(united.sample(10).column('Delay'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Simulate the empirical distribution of the median (statistic) using a sample size of 1000\n",
    "#We generate 10000 samples of size 1000 (there will be 10000 estimates of the median)\n",
    "#This cell takes a few seconds to run\n",
    "medians = make_array()\n",
    "\n",
    "for i in np.arange(10000):\n",
    "    new_median = np.median(united.sample(1000).column('Delay'))\n",
    "    medians = np.append(medians, new_median)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Display the empirical distribution of the median as a histogram\n",
    "Table().with_column('Sample Median', medians).hist(bins = np.arange(-0.5, 5.6, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}