{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib\n",
    "from datascience import *\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "import numpy as np\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 27 ##"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Central Limit Theorem ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the flight table used as the population for every simulation below.\n",
    "united = Table.read_table('united_summer2015.csv')\n",
    "united"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united.hist('Delay', bins = np.arange(-20, 300, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean and SD of the 'Delay' column of the whole table.\n",
    "delays = united.column('Delay')\n",
    "mean_delay = np.mean(delays)\n",
    "sd_delay = np.std(delays)\n",
    "\n",
    "mean_delay, sd_delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 50th percentile of the delays.\n",
    "percentile(50, delays)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_size = 400\n",
    "repetitions = 10000\n",
    "\n",
    "# Preallocate the result array: np.append would copy the whole array on every iteration.\n",
    "means_400 = np.zeros(repetitions)\n",
    "\n",
    "for i in np.arange(repetitions):\n",
    "    sampled_flights = united.sample(sample_size)\n",
    "    sample_mean = np.mean(sampled_flights.column('Delay'))\n",
    "    means_400[i] = sample_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_columns('Sample Mean', means_400).hist(bins = 20)\n",
    "\n",
    "plots.title('Sample Size ' + str(sample_size))\n",
    "plots.xlabel('Sample Average')\n",
    "print('Population Average: ', mean_delay);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.average(means_400)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Variability of the Sample Average ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_size = 900\n",
    "repetitions = 10000\n",
    "\n",
    "# Preallocate the result array: np.append would copy the whole array on every iteration.\n",
    "means_900 = np.zeros(repetitions)\n",
    "\n",
    "for i in np.arange(repetitions):\n",
    "    sampled_flights = united.sample(sample_size)\n",
    "    sample_mean = np.mean(sampled_flights.column('Delay'))\n",
    "    means_900[i] = sample_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One column of simulated sample means per sample size, for a shared histogram.\n",
    "means_tbl = Table().with_columns(\n",
    "    '400', means_400,\n",
    "    '900', means_900\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "means_tbl.hist(bins = np.arange(5, 31, 0.5))\n",
    "plots.title('Distribution of Sample Average');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sample_means(sample_size, repetitions=10000):\n",
    "    \"\"\"Empirical distribution of random sample means.\n",
    "\n",
    "    Draws `repetitions` random samples of `sample_size` rows from the\n",
    "    `united` table, plots a histogram of the sample mean delays, and\n",
    "    prints the population and empirical summary statistics.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    sample_size : int\n",
    "        Number of rows drawn in each random sample.\n",
    "    repetitions : int, optional\n",
    "        Number of samples to draw (default 10000).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "        The histogram and the printed summary are the output.\n",
    "    \"\"\"\n",
    "    # Fetch the population column once; it is needed for both mean and SD.\n",
    "    population_delays = united.column('Delay')\n",
    "\n",
    "    # Preallocate: np.append would copy the whole array on every iteration.\n",
    "    means = np.zeros(repetitions)\n",
    "\n",
    "    for i in range(repetitions):\n",
    "        sampled_flights = united.sample(sample_size)\n",
    "        means[i] = np.mean(sampled_flights.column('Delay'))\n",
    "\n",
    "    # Not named `sample_means`, so the local does not shadow this function.\n",
    "    sample_means_tbl = Table().with_column('Sample Means', means)\n",
    "\n",
    "    # Display empirical histogram and print all relevant quantities\n",
    "    sample_means_tbl.hist(bins=20)\n",
    "    plots.xlabel('Sample Means')\n",
    "    plots.title('Sample Size ' + str(sample_size))\n",
    "    print(\"Sample size: \", sample_size)\n",
    "    print(\"Population mean:\", np.mean(population_delays))\n",
    "    print(\"Average of sample means: \", np.mean(means))\n",
    "    print(\"Population SD:\", np.std(population_delays))\n",
    "    print(\"SD of sample means:\", np.std(means))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_means(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_means(400)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_means(625)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 10, 20 and 25 are the square roots of the sample sizes 100, 400 and 625 used above.\n",
    "sd_delay, sd_delay / make_array(10, 20, 25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_sizes = np.arange(50, 401, 50)\n",
    "repetitions = 10000\n",
    "\n",
    "# One SD per sample size; preallocated instead of grown with np.append.\n",
    "sd_of_sample_means = np.zeros(len(sample_sizes))\n",
    "\n",
    "for j, n in enumerate(sample_sizes):\n",
    "    means = np.zeros(repetitions)\n",
    "    for i in np.arange(repetitions):\n",
    "        means[i] = np.mean(united.sample(n).column('Delay'))\n",
    "    sd_of_sample_means[j] = np.std(means)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simulated SD of the sample means next to population SD / sqrt(n).\n",
    "sd_comparison = Table().with_columns(\n",
    "    'Sample Size n', sample_sizes,\n",
    "    'SD of 10,000 Sample Means', sd_of_sample_means,\n",
    "    'Population_SD/sqrt(n)', sd_delay/np.sqrt(sample_sizes)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sd_comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sd_comparison.scatter('Sample Size n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}