{
 "cells": [
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Seed NumPy's global random generator so that a fresh kernel plus Run All\n",
    "# reproduces the same samples and figures. Re-running a single cell afterwards\n",
    "# still draws a fresh sample, because the generator state keeps advancing.\n",
    "# NOTE(review): assumes the datascience sampling helpers draw from np.random.\n",
    "np.random.seed(42)"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "## Distributions ##" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# One row per face of a six-sided die (1 through 6).\n",
    "die = Table().with_column('Face', np.arange(1, 7))\n",
    "die"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Draw 10 rows at random from the die table.\n",
    "die.sample(10)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Bin edges 0.5, 1.5, ..., 6.5, so that each bar is centred on a whole face value.\n",
    "roll_bins = np.arange(0.5, 6.6, 1)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Distribution of the faces in the die table itself; 1/6 is printed for reference.\n",
    "die.hist(bins=roll_bins)\n",
    "print(1/6)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Distribution of the faces in one random sample of 10 rows.\n",
    "die.sample(10).hist(bins=roll_bins)"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "#### Large Random Samples" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Each sample size is drawn twice, in separate cells, so the two histograms can be compared.\n",
    "die.sample(10).hist(bins=roll_bins)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "die.sample(10).hist(bins=roll_bins)" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "die.sample(100).hist(bins=roll_bins)" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "die.sample(100).hist(bins=roll_bins)" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "die.sample(1000).hist(bins=roll_bins)" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "die.sample(1000).hist(bins=roll_bins)" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Load the flight table and add a 0-based 'Row' number as its first column.\n",
    "united = Table.read_table('united_summer2015.csv')\n",
    "united = united.with_column('Row', np.arange(united.num_rows)).move_to_start('Row')"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "united" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Bin edges -20, -10, ..., 200, shared by every 'Delay' histogram below\n",
    "# (named once, like roll_bins, instead of repeating the literal in each cell).\n",
    "delay_bins = np.arange(-20, 201, 10)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Distribution of 'Delay' over the whole table.\n",
    "united.hist('Delay', bins=delay_bins)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Distribution of 'Delay' in random samples of 10 and then 100 rows, two draws of each size.\n",
    "united.sample(10).hist('Delay', bins=delay_bins)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "united.sample(10).hist('Delay', bins=delay_bins)" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "united.sample(100).hist('Delay', bins=delay_bins)" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "united.sample(100).hist('Delay', bins=delay_bins)" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "## Simulating a Statistic ##" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Median of the 'Delay' column over the whole table.\n",
    "np.median(united.column('Delay'))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Proportion of rows whose 'Delay' is at most 2.\n",
    "united.where('Delay', are.below_or_equal_to(2)).num_rows / united.num_rows"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# The statistic: median 'Delay' in one random sample of 10 rows.\n",
    "np.median(united.sample(10).column('Delay'))"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Simulation: collect the median 'Delay' of 10,000 random samples of 1,000 rows each.\n",
    "medians = make_array()\n",
    "\n",
    "for i in np.arange(10000):\n",
    "    new_median = np.median(united.sample(1000).column('Delay'))\n",
    "    medians = np.append(medians, new_median)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Bin edges -0.5, 0.5, ..., 5.5, so that each bar is centred on a whole value.\n",
    "Table().with_column('Sample Median', medians).hist(bins=np.arange(-0.5, 5.6, 1))"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "## Swain vs. Alabama ##" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Proportions of the eligible population in two categories; item 0 (0.26) is the\n",
    "# proportion of black men, which is the category counted by the statistic below.\n",
    "eligible_population = make_array(0.26, 0.74)\n",
    "eligible_population"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Category proportions in one random sample of 100.\n",
    "sample_proportions(100, eligible_population)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# statistic: number of black men among random sample\n",
    "# of 100 men from eligible population\n",
    "\n",
    "100 * sample_proportions(100, eligible_population).item(0)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Simulation: repeat the statistic 10,000 times.\n",
    "\n",
    "counts = make_array()\n",
    "\n",
    "for i in np.arange(10000):\n",
    "    new_count = 100 * sample_proportions(100, eligible_population).item(0)\n",
    "    counts = np.append(counts, new_count)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "counts" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Visualization\n",
    "\n",
    "observed_count = 8\n",
    "\n",
    "# Bin edges 5.5, 6.5, ..., 46.5: each bar is centred on a whole count, and the range\n",
    "# extends below the observed count so the red dot lies inside the binned range.\n",
    "# Values outside the bin range are left out of a histogram, so the bins are kept wide.\n",
    "Table().with_column('Random Sample Count', counts).hist(bins=np.arange(5.5, 46.6, 1))\n",
    "\n",
    "# Mark the observed count on the horizontal axis.\n",
    "plots.scatter(observed_count, 0, color='red', s=50);"
  ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "## Mendel and Pea Flowers ##" ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Model proportions for two categories; item 0 (0.75) is the proportion of purple\n",
    "# plants, which is the category used by the statistic below.\n",
    "model = make_array(0.75, 0.25)\n",
    "model"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Category proportions in one random sample of 929 drawn under the model.\n",
    "sample_proportions(929, model)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# statistic: distance between sample percent (of purple plants) and 75\n",
    "\n",
    "abs(100 * sample_proportions(929, model).item(0) - 75)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Simulation: repeat the statistic 10,000 times.\n",
    "\n",
    "distances = make_array()\n",
    "\n",
    "for i in np.arange(10000):\n",
    "    new_distance = abs(100 * sample_proportions(929, model).item(0) - 75)\n",
    "    distances = np.append(distances, new_distance)"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Distribution of the simulated distances (default bins).\n",
    "Table().with_column('Distance from 75%', distances).hist()"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Observed statistic: 705 out of 929, as a percent, compared with 75.\n",
    "observed_distance = abs(100*(705/929) - 75)\n",
    "observed_distance"
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Same histogram again, now with the observed distance marked on the horizontal axis.\n",
    "Table().with_column('Distance from 75%', distances).hist()\n",
    "plots.scatter(observed_distance, 0, color='red', s=30);"
  ] }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
  "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}