{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib\n",
    "from datascience import *\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "import numpy as np\n",
    "plots.style.use('fivethirtyeight')\n",
    "import warnings\n",
    "warnings.simplefilter(action=\"ignore\", category=FutureWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 26 ##"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chebyshev's Bounds ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "births = Table.read_table('baby.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "births.labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "births.drop(5).hist(overlay = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mpw = births.column('Maternal Pregnancy Weight')\n",
    "mean = np.mean(mpw)\n",
    "sd = np.std(mpw)\n",
    "mean, sd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "within_3_SDs = births.where('Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "within_3_SDs.num_rows/births.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "1 - 1/3**2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# See if Chebyshev's bounds work for different distributions\n",
    "\n",
    "for k in births.labels:\n",
    "    values = births.column(k)\n",
    "    mean = np.mean(values)\n",
    "    sd = np.std(values)\n",
    "    print()\n",
    "    print(k)\n",
    "    for z in np.arange(2, 6):\n",
    "        chosen = births.where(k, are.between(mean - z*sd, mean + z*sd))\n",
    "        proportion = chosen.num_rows/births.num_rows\n",
    "        percent = round(proportion * 100, 2)\n",
    "        print('Average plus or minus', z, 'SDs:', percent, '%')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Standard Units ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def standard_units(x):\n",
    "    \"\"\"Convert array x to standard units.\"\"\"\n",
    "    return (x - np.mean(x))/np.std(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ages = births.column('Maternal Age')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ages_standard_units = standard_units(ages)\n",
    "ages_standard_units"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(ages_standard_units), np.std(ages_standard_units)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "both = Table().with_columns(\n",
    "    'Age in Years', ages,\n",
    "    'Age in Standard Units', ages_standard_units\n",
    ")\n",
    "both\n",
    "#####################"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(ages), np.std(ages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "both.hist('Age in Years', bins = np.arange(15, 46, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))\n",
    "plots.xlim(-2, 3.1);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The SD and Bell Shaped Curves ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "heights = births.column('Maternal Height')\n",
    "np.mean(heights), np.std(heights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "births.hist('Birth Weight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bw = births.column('Birth Weight')\n",
    "mean_w = np.mean(bw)\n",
    "sd_w = np.std(bw)\n",
    "mean_w, sd_w"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## The normal curve ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Roulette:  38 pockets\n",
    "# bets on red pays 1 to 1\n",
    "red_winnings = np.append(1*np.ones(18), -1*np.ones(20))\n",
    "red = Table().with_columns('Winnings on Red', red_winnings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "red.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "red.hist(bins = np.arange(-1.5, 1.6, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "18/38 #chance of making $1 from bet placed on red"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_bets = 100 #different spins of the roulette with bets on red\n",
    "\n",
    "net_gains = make_array() #amount won from num_bets on red\n",
    "\n",
    "for i in np.arange(20000):\n",
    "    spins = red.sample(num_bets)\n",
    "    new_net_gain = sum(spins.column('Winnings on Red'))\n",
    "    net_gains = np.append(net_gains, new_net_gain)\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Bell-shaped histogram \n",
    "#(even though original distribution was far from bell-shaped)\n",
    "Table().with_columns('Net Gain', net_gains).hist()\n",
    "plots.xticks(np.arange(-45, 36, 10));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.average(net_gains)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(net_gains)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Central Limit Theorem and Simulating Sample Mean ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united = Table.read_table('united_summer2015.csv')\n",
    "united"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united.hist('Delay', bins = np.arange(-20, 300, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "delays = united.column('Delay')\n",
    "mean_delay = np.mean(delays)\n",
    "sd_delay = np.std(delays)\n",
    "\n",
    "mean_delay, sd_delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united = united.with_columns(\n",
    "    'Delay in Standard Units', standard_units(delays)\n",
    ")\n",
    "united.sort('Delay', descending=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Chebychev: at least 89% within 3 SD\n",
    "chosen = united.where('Delay in Standard Units', are.between(-3, 3))\n",
    "chosen.num_rows/united.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united.hist('Delay', bins = np.arange(-20, 300, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Take random sample from population of size sample_size \n",
    "#Repeat to get empirical distribution of sample average\n",
    "sample_size = 400\n",
    "\n",
    "means = make_array()\n",
    "\n",
    "for i in np.arange(10000):\n",
    "    sampled_flights = united.sample(sample_size)\n",
    "    sample_mean = np.mean(sampled_flights.column('Delay'))\n",
    "    means = np.append(means, sample_mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_columns('Sample Mean', means).hist(bins = 20)\n",
    "plots.title('Sample Means: Sample Size ' + str(sample_size))\n",
    "plots.xlabel('Random Sample Mean');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(means), np.std(means)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_delay, sd_delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sd_delay/np.sqrt(sample_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}