{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 23 ##"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Percentiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Five values in arbitrary order, then the same values sorted\n",
    "s = [1, 7, 3, 9, 5]\n",
    "np.sort(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Is the 10th percentile of s equal to 0?\n",
    "percentile(10, s) == 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Do the 39th and 40th percentiles of s agree?\n",
    "percentile(39, s) == percentile(40, s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Do the 40th and 41st percentiles of s agree?\n",
    "percentile(40, s) == percentile(41, s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Is the 50th percentile of s equal to 5?\n",
    "percentile(50, s) == 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Six values; y holds them in sorted order\n",
    "x = make_array(43, 20, 51, 7, 28, 34)\n",
    "y = np.sort(x)\n",
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 55% of the number of elements in x\n",
    "0.55 * len(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "percentile(55, x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 95% of the number of elements in x\n",
    "0.95 * len(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "percentile(95, x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 99% of the number of elements in x\n",
    "0.99 * len(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "percentile(99, x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Bootstrap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compensation data for employees of the city of San Francisco in 2015\n",
    "sf = Table.read_table('san_francisco_2015.csv')\n",
    "sf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows whose Job is 'Mayor'\n",
    "sf.where('Job', 'Mayor')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The five largest values of Total Compensation\n",
    "sf.sort('Total Compensation', descending=True).show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The five smallest values of Total Compensation\n",
    "sf.sort('Total Compensation', descending=False).show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only rows with Total Compensation above 10,000.\n",
    "# This rebinds sf, so the cells above displayed the unfiltered table.\n",
    "sf = sf.where('Total Compensation', are.above(10000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sf.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bin edges every 25,000, starting at 0 and stopping before 700,000\n",
    "sf_bins = np.arange(0, 700000, 25000)\n",
    "sf.hist('Total Compensation', bins=sf_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Median of Total Compensation over the whole (filtered) table\n",
    "pop_median = percentile(50, sf.column('Total Compensation'))\n",
    "pop_median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A sample of 300 rows drawn without replacement\n",
    "our_sample = sf.sample(300, with_replacement=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "our_sample.hist('Total Compensation', bins=sf_bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Median of Total Compensation in our sample\n",
    "est_median = percentile(50, our_sample.column('Total Compensation'))\n",
    "est_median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One resample of our_sample\n",
    "# (Table.sample defaults: as many rows as the table, with replacement)\n",
    "resample1 = our_sample.sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "percentile(50, resample1.column('Total Compensation'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bootstrap_median(original_sample, label, replications):\n",
    "    \"\"\"Simulate the sample median by resampling from original_sample.\n",
    "\n",
    "    original_sample: table containing the original sample\n",
    "    label: label of column containing the variable\n",
    "    replications: number of bootstrap samples\n",
    "    Returns array of bootstrap sample medians, one per replication\n",
    "    \"\"\"\n",
    "    medians = []\n",
    "    for _ in np.arange(replications):\n",
    "        bootstrap_sample = original_sample.sample()\n",
    "        resampled_median = percentile(50, bootstrap_sample.column(label))\n",
    "        medians.append(resampled_median)\n",
    "\n",
    "    # Build the array once at the end; calling np.append inside the loop\n",
    "    # would copy every median collected so far on each replication.\n",
    "    return np.append(make_array(), medians)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2000 bootstrap medians computed from our sample\n",
    "bstrap_medians = bootstrap_median(our_sample, 'Total Compensation', 2000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resampled_medians = Table().with_column(\n",
    "    'Bootstrap Sample Median', bstrap_medians)\n",
    "\n",
    "resampled_medians.hist()\n",
    "\n",
    "# Red dot on the horizontal axis: the population median\n",
    "plots.scatter(pop_median, 0, color='red', s=40);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Left end of the interval: 2.5th percentile of the bootstrap medians\n",
    "left = percentile(2.5, bstrap_medians)\n",
    "left"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Right end of the interval: 97.5th percentile of the bootstrap medians\n",
    "right = percentile(97.5, bstrap_medians)\n",
    "right"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resampled_medians.hist()\n",
    "\n",
    "# Yellow bar: the interval from left to right; red dot: the population median\n",
    "plots.plot([left, right], [0, 0], color='yellow', lw=10, zorder=1)\n",
    "plots.scatter(pop_median, 0, color='red', s=50, zorder=2);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "confidence_interval = make_array(left, right)\n",
    "confidence_interval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# THE BIG SIMULATION: This one takes a long time.\n",
    "\n",
    "# Generate the endpoints of num_intervals intervals, each from a fresh sample\n",
    "num_intervals = 50\n",
    "\n",
    "left_ends = make_array()\n",
    "right_ends = make_array()\n",
    "\n",
    "for i in np.arange(num_intervals):\n",
    "    first_sample = sf.sample(300, with_replacement=False)\n",
    "    medians = bootstrap_median(first_sample, 'Total Compensation', 2000)\n",
    "    left_ends = np.append(left_ends, percentile(2.5, medians))\n",
    "    right_ends = np.append(right_ends, percentile(97.5, medians))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "intervals = Table().with_columns(\n",
    "    'Left', left_ends,\n",
    "    'Right', right_ends\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "intervals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the intervals that contain the population median.\n",
    "# Endpoints are included: an interval whose end equals pop_median contains it.\n",
    "good = (intervals\n",
    "        .where('Left', are.below_or_equal_to(pop_median))\n",
    "        .where('Right', are.above_or_equal_to(pop_median))\n",
    "        .num_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Proportion of intervals that contain the population median\n",
    "good / intervals.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}