{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib\n",
    "from datascience import *\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "import numpy as np\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 27 ##"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Central Limit Theorem ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united = Table.read_table('united_summer2015.csv')\n",
    "united"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united.hist('Delay', bins = np.arange(-20, 300, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "delays = united.column('Delay')\n",
    "mean_delay = np.mean(delays)\n",
    "sd_delay = np.std(delays)\n",
    "\n",
    "mean_delay, sd_delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "percentile(50, delays)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_size = 400\n",
    "\n",
    "means_400 = make_array()\n",
    "\n",
    "for i in np.arange(10000):\n",
    "    sampled_flights = united.sample(sample_size)\n",
    "    sample_mean = np.mean(sampled_flights.column('Delay'))\n",
    "    means_400 = np.append(means_400, sample_mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_columns('Sample Mean', means_400).hist(bins = 20)\n",
    "\n",
    "plots.title('Sample Size ' + str(sample_size))\n",
    "plots.xlabel('Sample Average')\n",
    "print('Population Average: ', mean_delay);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.average(means_400)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Variability of the Sample Average ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_size = 900\n",
    "\n",
    "means_900 = make_array()\n",
    "\n",
    "for i in np.arange(10000):\n",
    "    sampled_flights = united.sample(sample_size)\n",
    "    sample_mean = np.mean(sampled_flights.column('Delay'))\n",
    "    means_900 = np.append(means_900, sample_mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "means_tbl = Table().with_columns(\n",
    "    '400', means_400,\n",
    "    '900', means_900\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "means_tbl.hist(bins = np.arange(5, 31, 0.5))\n",
    "plots.title('Distribution of Sample Average');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#####################################################\n",
    "\"\"\"Empirical distribution of random sample means\"\"\"\n",
    "\n",
    "def sample_means(sample_size):\n",
    "    \n",
    "    repetitions = 10000\n",
    "    means = make_array()\n",
    "\n",
    "    for i in range(repetitions):\n",
    "        sampled_flights = united.sample(sample_size)\n",
    "        sample_mean = np.mean(sampled_flights.column('Delay'))\n",
    "        means = np.append(means, sample_mean)\n",
    "\n",
    "    sample_means = Table().with_column('Sample Means', means)\n",
    "    \n",
    "    # Display empirical histogram and print all relevant quantities\n",
    "    sample_means.hist(bins=20)\n",
    "    plots.xlabel('Sample Means')\n",
    "    plots.title('Sample Size ' + str(sample_size))\n",
    "    print(\"Sample size: \", sample_size)\n",
    "    print(\"Population mean:\", np.mean(united.column('Delay')))\n",
    "    print(\"Average of sample means: \", np.mean(means))\n",
    "    print(\"Population SD:\", np.std(united.column('Delay')))\n",
    "    print(\"SD of sample means:\", np.std(means))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_means(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_means(400)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_means(625)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sd_delay, sd_delay / make_array(10, 20, 25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_sizes = np.arange(50, 401, 50)\n",
    "\n",
    "sd_of_sample_means = make_array()\n",
    "\n",
    "for n in sample_sizes:\n",
    "    means = make_array()\n",
    "    for i in np.arange(10000):\n",
    "        means = np.append(means, np.mean(united.sample(n).column('Delay')))\n",
    "    sd_of_sample_means = np.append(sd_of_sample_means, np.std(means))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sd_comparison = Table().with_columns(\n",
    "    'Sample Size n', sample_sizes,\n",
    "    'SD of 10,000 Sample Means', sd_of_sample_means,\n",
    "    'Population_SD/sqrt(n)', sd_delay/np.sqrt(sample_sizes)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sd_comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sd_comparison.scatter('Sample Size n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}