{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datascience import *\n", "import numpy as np\n", "\n", "%matplotlib inline\n", "import matplotlib.pyplot as plots\n", "plots.style.use('fivethirtyeight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lecture 19 ##" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Birth weights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Birth weight in ounces; Maternal Pregnancy weight in pounds\n", "baby = Table.read_table('baby.csv')\n", "baby" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "smoking_and_birthweight = baby.select('Birth Weight', 'Maternal Smoker')\n", "smoking_and_birthweight" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "smoking_and_birthweight.group('Maternal Smoker')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "smoking_and_birthweight.hist('Birth Weight')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "observed_means = smoking_and_birthweight.group('Maternal Smoker', np.average)\n", "observed_means" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "################################\n", "observed_diff = observed_means.column(1).item(0) - observed_means.column(1).item(1)\n", "observed_diff" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def diff_between_group_means(tbl):\n", " means = tbl.group('Maternal Smoker', np.average)\n", " return means.column(1).item(0) - means.column(1).item(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# PLAN:\n", "\n", "# Shuffle birth weights\n", "\n", "# Assign some to group A and some to group B\n", "\n", "# Find difference between averages of the two groups (statistic)\n", "\n", "# Repeat" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "weights = smoking_and_birthweight.select('Birth Weight')\n", "weights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "smoking = smoking_and_birthweight.select('Maternal Smoker')\n", "smoking" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Shuffle birth weights\n", "shuffled_weights = weights.sample(with_replacement=False).column(0)\n", "shuffled_weights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Assign some to group A and some to group B\n", "shuffled = smoking.with_column('Shuffled weights', shuffled_weights)\n", "shuffled" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Find difference between averages of the two groups (statistic)\n", "diff = diff_between_group_means(shuffled)\n", "diff" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Repeat\n", "diffs = make_array()\n", "for i in np.arange(2000):\n", " shuffled_weights = weights.sample(with_replacement=False).column(0)\n", " shuffled = smoking.with_column('Shuffled weights', shuffled_weights)\n", " means = shuffled.group('Maternal Smoker', np.average)\n", " diff = means.column(1).item(0) - means.column(1).item(1)\n", " diffs = np.append(diffs, diff)\n", "\n", "diffs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Table().with_column('Difference between group means', diffs).hist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "observed_diff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Deflategate ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Pressure measured in \"pounds per square inch\" (psi)\n", "#Two official (Blakeman and Prioleau) measured pressure \n", "# of balls at half-time\n", "# Most of Colts balls were not measured\n", "football = Table.read_table('deflategate.csv')\n", "football.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Take average of two measurements\n", "football = football.select('Team').with_column(\n", " 'Combined', (football.column('Blakeman')+football.column('Prioleau'))/2\n", " )\n", "football.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.ones(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Before the start of AFC game, ball pressures were measured\n", "#NFL rule: ball pressure between 12.5 and 13.5 psi\n", "#Patriots' balls were all about 12.5 psi\n", "#Colts' balls were about 13.0 psi\n", "start = np.append(12.5 * np.ones(11), 13 * np.ones(4))\n", "start" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Some deflation is normal during game; investigate the pressure drop\n", "drop_values = start - football.column('Combined')\n", "drop_values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "football = football.drop('Combined').with_column('Drop', drop_values)\n", "football.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "means_tbl = football.group('Team', np.average)\n", "means_tbl" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drop_avgs = means_tbl.column('Drop average')\n", "observed_difference = drop_avgs.item(1) - drop_avgs.item(0)\n", "observed_difference\n", "#Large values imply larger drop for Patriots" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Do the 11 Patriot balls look like a random sample of the 15 balls?\n", "# Could this difference be due to chance?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "group_labels = football.select('Team')\n", "drops = football.select('Drop')\n", "group_labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "differences = make_array()\n", "\n", "for i in np.arange(20000):\n", " shuffled_drops = drops.sample(with_replacement = False).column('Drop')\n", " shuffled_tbl = group_labels.with_column('Shuffled Drop', shuffled_drops)\n", " means_tbl = shuffled_tbl.group('Team', np.average)\n", " drop_avgs = means_tbl.column('Shuffled Drop average')\n", " new_diff = drop_avgs.item(1) - drop_avgs.item(0)\n", " differences = np.append(differences, new_diff)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Table().with_column('Difference Between Means', differences).hist()\n", "plots.scatter(observed_difference, 0, color='red', s=40);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.count_nonzero(differences >= observed_difference) / 20000" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 1 }