{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 15 ##"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Sampling ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# United Airlines domestic flights\n",
    "# departing from San Francisco in the summer of 2015\n",
    "united = Table.read_table('united_summer2015.csv')\n",
    "# Add an explicit row-number column so sampled rows can be traced back to the full table\n",
    "united = united.with_column('Row', np.arange(united.num_rows)).move_to_start('Row')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "united"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deterministic sample: we choose the rows ourselves, no chance involved\n",
    "united.take(make_array(999, 1000, 1001))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Systematic sample: pick a random starting row, then take every step-th row after it\n",
    "# (the start is drawn from 0..step-1, so the same step must be used in both places)\n",
    "step = 1000\n",
    "start = np.random.choice(np.arange(step))\n",
    "systematic_sample = united.take(np.arange(start, united.num_rows, step))\n",
    "systematic_sample.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Distributions ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Options for a six-sided die\n",
    "die = Table().with_column('Face', np.arange(1, 7))\n",
    "die"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table method .sample() draws at random with replacement\n",
    "# from the rows of a table (pass with_replacement=False to draw without replacement)\n",
    "die.sample(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up bins for the die faces (each integer face sits in the middle of its bin)\n",
    "roll_bins = np.arange(0.5, 6.6, 1)\n",
    "roll_bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Theoretical distribution (equal chance for each outcome)\n",
    "die.hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Empirical distribution based on a sample of size 10\n",
    "die.sample(10).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Large Random Samples ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Another empirical distribution based on a sample of size 10\n",
    "die.sample(10).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# And another (the distributions change quite a bit from sample to sample)\n",
    "die.sample(10).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Increase the sample size to 100\n",
    "die.sample(100).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Another empirical distribution with a sample size of 100\n",
    "# These appear more stable than with a sample size of 10\n",
    "die.sample(100).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample size of 1000\n",
    "die.sample(1000).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Another with a sample size of 1000\n",
    "# More stable than those with a sample size of 10 or 100\n",
    "# Closer to the theoretical distribution (equal probabilities)\n",
    "die.sample(1000).hist(bins = roll_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's get back to the united data\n",
    "# Recall the column headers\n",
    "united.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot a histogram of the delays\n",
    "# The same bins are reused for every delay histogram below so the plots are directly comparable\n",
    "delay_bins = np.arange(-20, 201, 10)\n",
    "united.hist('Delay', bins = delay_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We missed some values with the bins we specified\n",
    "min(united.column('Delay')), max(united.column('Delay'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's see which row had the longest delay (580 minutes)\n",
    "# Look the maximum up from the data rather than typing it in by hand\n",
    "united.where('Delay', max(united.column('Delay')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Proportion of flights with a delay above 200 (the upper edge of our bins)\n",
    "# We will focus on where the bulk of the data are and set aside the 0.8% large values for now\n",
    "united.where('Delay', are.above(200)).num_rows / united.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# united includes all the domestic flights departing from San Francisco during the summer of 2015\n",
    "# Think of united as a population\n",
    "# Now we will draw samples from it and view the corresponding empirical distributions\n",
    "delay_bins = np.arange(-20, 201, 10)\n",
    "united.sample(10).hist('Delay', bins = delay_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Another empirical distribution with a sample size of 10\n",
    "united.sample(10).hist('Delay', bins = delay_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Empirical distribution with a sample size of 100\n",
    "united.sample(100).hist('Delay', bins = delay_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Another empirical distribution with a sample size of 100\n",
    "united.sample(100).hist('Delay', bins = delay_bins)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simulating Statistics ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Considering united as a population\n",
    "# The population median delay is:\n",
    "population_median = np.median(united.column('Delay'))\n",
    "population_median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Proportion of the data less than or equal to the population median\n",
    "# (uses the value computed above instead of a hand-typed constant)\n",
    "united.where('Delay', are.below_or_equal_to(population_median)).num_rows / united.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What if we take a random sample of size 10 - what's the estimated median?\n",
    "np.median(united.sample(10).column('Delay'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simulate the empirical distribution of the median (statistic) using a sample size of 1000\n",
    "# We generate 10000 samples of size 1000 (there will be 10000 estimates of the median)\n",
    "# This cell takes a few seconds to run\n",
    "num_simulations = 10000\n",
    "sample_size = 1000\n",
    "\n",
    "# Collect all the medians in one pass instead of growing the array with np.append,\n",
    "# which copies the whole array on every iteration\n",
    "medians = np.array([\n",
    "    np.median(united.sample(sample_size).column('Delay'))\n",
    "    for _ in np.arange(num_simulations)\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the empirical distribution of the median as a histogram\n",
    "Table().with_column('Sample Median', medians).hist(bins = np.arange(-0.5, 5.6, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}