{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib\n",
    "from datascience import *\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "import numpy as np\n",
    "plots.style.use('fivethirtyeight')\n",
    "import warnings\n",
    "warnings.simplefilter(action=\"ignore\", category=FutureWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed NumPy's global random generator so that the simulations below\n",
    "# (which draw random rows with Table.sample) give the same results on every\n",
    "# Restart & Run All. Change or remove the seed to see fresh random draws.\n",
    "SEED = 26\n",
    "np.random.seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 26 ##"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chebyshev's Bounds ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "births = Table.read_table('baby.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "births.labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Histogram of every column except the one at index 5 (see births.labels above),\n",
    "# drawn as separate plots rather than overlaid.\n",
    "births.drop(5).hist(overlay = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean and SD of maternal pregnancy weight; both are reused in the next cells.\n",
    "mpw = births.column('Maternal Pregnancy Weight')\n",
    "mean = np.mean(mpw)\n",
    "sd = np.std(mpw)\n",
    "mean, sd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows whose pregnancy weight lies within 3 SDs of the mean\n",
    "within_3_SDs = births.where('Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Observed proportion of rows within 3 SDs of the mean\n",
    "within_3_SDs.num_rows/births.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chebyshev's lower bound on that proportion for z = 3: 1 - 1/z^2\n",
    "1 - 1/3**2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# See if Chebyshev's bounds work for different distributions\n",
    "\n",
    "for label in births.labels:\n",
    "    values = births.column(label)\n",
    "    # Loop-local names: the `mean` and `sd` of Maternal Pregnancy Weight computed\n",
    "    # above must not be overwritten, or re-running the earlier cells would\n",
    "    # silently use the statistics of the last column visited here.\n",
    "    col_mean = np.mean(values)\n",
    "    col_sd = np.std(values)\n",
    "    print()\n",
    "    print(label)\n",
    "    for z in np.arange(2, 6):\n",
    "        within = births.where(label, are.between(col_mean - z*col_sd, col_mean + z*col_sd))\n",
    "        proportion = within.num_rows/births.num_rows\n",
    "        percent = round(proportion * 100, 2)\n",
    "        print('Average plus or minus', z, 'SDs:', percent, '%')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Standard Units ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def standard_units(x):\n",
    "    \"\"\"Convert array x to standard units.\n",
    "\n",
    "    Each value has the mean of x subtracted from it and is then divided by\n",
    "    the SD of x (np.std with its default settings).\n",
    "\n",
    "    x may be a NumPy array or any array-like of numbers (e.g. a list).\n",
    "    Returns a NumPy array of the same length as x.\n",
    "    \"\"\"\n",
    "    # asarray leaves NumPy arrays untouched and lets plain lists work too.\n",
    "    x = np.asarray(x)\n",
    "    return (x - np.mean(x))/np.std(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ages = births.column('Maternal Age')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ages_standard_units = standard_units(ages)\n",
    "ages_standard_units"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean and SD of the ages after conversion to standard units\n",
    "np.mean(ages_standard_units), np.std(ages_standard_units)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The same ages side by side in original units and in standard units\n",
    "both = Table().with_columns(\n",
    "    'Age in Years', ages,\n",
    "    'Age in Standard Units', ages_standard_units\n",
    ")\n",
    "both"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(ages), np.std(ages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "both.hist('Age in Years', bins = np.arange(15, 46, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))\n",
    "plots.xlim(-2, 3.1);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The SD and Bell Shaped Curves ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "heights = births.column('Maternal Height')\n",
    "np.mean(heights), np.std(heights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "births.hist('Birth Weight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bw = births.column('Birth Weight')\n",
    "mean_w = np.mean(bw)\n",
    "sd_w = np.std(bw)\n",
    "mean_w, sd_w"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## The normal curve ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Roulette: 38 pockets\n",
    "# A bet on red pays 1 to 1: 18 pockets win $1, the other 20 lose $1\n",
    "red_winnings = np.append(1*np.ones(18), -1*np.ones(20))\n",
    "red = Table().with_columns('Winnings on Red', red_winnings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "red.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "red.hist(bins = np.arange(-1.5, 1.6, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "18/38 #chance of making $1 from bet placed on red"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_bets = 100           # different spins of the roulette with bets on red\n",
    "num_simulations = 20000  # how many times the num_bets spins are repeated\n",
    "\n",
    "# Net amount won from num_bets bets on red, one entry per repetition.\n",
    "# The array is built once from a list: calling np.append inside the loop\n",
    "# would copy the whole array on every iteration.\n",
    "net_gains = np.array([\n",
    "    np.sum(red.sample(num_bets).column('Winnings on Red'))\n",
    "    for _ in np.arange(num_simulations)\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Bell-shaped histogram \n",
    "#(even though original distribution was far from bell-shaped)\n",
    "Table().with_columns('Net Gain', net_gains).hist()\n",
    "plots.xticks(np.arange(-45, 36, 10));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.average(net_gains)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(net_gains)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Central Limit Theorem and Simulating Sample Mean ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united = Table.read_table('united_summer2015.csv')\n",
    "united"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united.hist('Delay', bins = np.arange(-20, 300, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Population mean and SD of the delays; compared with the simulation at the end\n",
    "delays = united.column('Delay')\n",
    "mean_delay = np.mean(delays)\n",
    "sd_delay = np.std(delays)\n",
    "\n",
    "mean_delay, sd_delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united = united.with_columns(\n",
    "    'Delay in Standard Units', standard_units(delays)\n",
    ")\n",
    "united.sort('Delay', descending=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chebyshev: at least 89% within 3 SDs\n",
    "chosen = united.where('Delay in Standard Units', are.between(-3, 3))\n",
    "chosen.num_rows/united.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The population histogram again, for comparison with the histogram of\n",
    "# sample means drawn below\n",
    "united.hist('Delay', bins = np.arange(-20, 300, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Take random sample from population of size sample_size \n",
    "#Repeat to get empirical distribution of sample average\n",
    "sample_size = 400\n",
    "num_samples = 10000  # number of random samples drawn\n",
    "\n",
    "# One sample mean per random sample. The array is built once from a list\n",
    "# instead of being grown (and copied) by np.append on every iteration.\n",
    "means = np.array([\n",
    "    np.mean(united.sample(sample_size).column('Delay'))\n",
    "    for _ in np.arange(num_samples)\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_columns('Sample Mean', means).hist(bins = 20)\n",
    "plots.title('Sample Means: Sample Size ' + str(sample_size))\n",
    "plots.xlabel('Random Sample Mean');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Center and spread of the simulated sample means\n",
    "np.mean(means), np.std(means)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Population mean and SD, for comparison\n",
    "mean_delay, sd_delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Population SD divided by the square root of the sample size;\n",
    "# compare with np.std(means) above\n",
    "sd_delay/np.sqrt(sample_size)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}