{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import Table, make_array\n",
    "import numpy as np\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 20"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Deflategate ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the table exactly as read so that every later cell can be re-run safely.\n",
    "football_raw = Table.read_table('deflategate.csv')\n",
    "football_raw.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average the 'Blakeman' and 'Prioleau' columns row by row into one 'Combined' column.\n",
    "combined = (football_raw.column('Blakeman') + football_raw.column('Prioleau')) / 2\n",
    "football_combined = football_raw.drop('Blakeman', 'Prioleau').with_column(\n",
    "    'Combined',\n",
    "    combined)\n",
    "football_combined.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# np.ones(n) is an array of n ones; multiplying it by a number repeats that number n times.\n",
    "np.ones(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One starting value per row: 12.5 for the first 11 rows, 13 for the last 4.\n",
    "# NOTE(review): assumes deflategate.csv has exactly 15 rows in that order -- confirm.\n",
    "initial_pressure = np.append(12.5 * np.ones(11), 13 * np.ones(4))\n",
    "initial_pressure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look the column up by label rather than by position so column order cannot matter.\n",
    "drop_values = initial_pressure - football_combined.column('Combined')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Built from football_combined (not from football itself), so this cell is idempotent.\n",
    "football = football_combined.drop('Combined').with_column('Drop', drop_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "football.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "means = football.group('Team', np.average)\n",
    "means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First group's average minus the second group's, in the row order of `means`.\n",
    "observed_difference = means.column(1).item(0) - means.column(1).item(1)\n",
    "observed_difference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def diff_between_means(tbl):\n",
    "    \"\"\"Return the difference between the two per-team averages of a table.\n",
    "\n",
    "    tbl is grouped by its 'Team' column with np.average; the value returned is\n",
    "    the first group's average minus the second group's, read from column 1 of\n",
    "    the grouped table (the first averaged column).\n",
    "    \"\"\"\n",
    "    team_means = tbl.group('Team', np.average).column(1)\n",
    "    return team_means.item(0) - team_means.item(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drops = football.select('Drop')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sampling every row without replacement is a random permutation of the 'Drop' values.\n",
    "shuffled_drops = drops.sample(with_replacement = False).column(0)\n",
    "shuffled_drops"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with_column on an existing label replaces that column, so 'Team' keeps its order\n",
    "# while the 'Drop' values are shuffled against it.\n",
    "simulated_football = football.with_column('Drop', shuffled_drops)\n",
    "simulated_football.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "diff_between_means(simulated_football)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Repeat the shuffle-and-compare step many times to collect simulated differences.\n",
    "num_simulations = 5000\n",
    "differences = make_array()\n",
    "\n",
    "for i in np.arange(num_simulations):\n",
    "    shuffled_drops = drops.sample(with_replacement = False).column(0)\n",
    "    simulated_football = football.with_column('Drop', shuffled_drops)\n",
    "    new_diff = diff_between_means(simulated_football)\n",
    "    differences = np.append(differences, new_diff)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_column('Difference Between Means', differences).hist()\n",
    "plots.title('Simulated differences between team means')\n",
    "# The red dot marks the difference computed from the unshuffled data.\n",
    "plots.scatter(observed_difference, 0, color='red', s=40);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Proportion of simulated differences that are at most the observed difference.\n",
    "np.average(differences <= observed_difference)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Analyzing RCTs ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Botulinum Toxin A (bta) as a treatment for chronic back pain\n",
    "# 15 in the treatment group\n",
    "# 16 in the control group (normal saline)\n",
    "# trials were run double-blind (neither doctors nor patients knew which group they were in)\n",
    "# Result = 1 indicates pain relief\n",
    "bta = Table.read_table('bta.csv')\n",
    "bta.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bta.group('Group', sum)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bta.group('Group', np.average)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "observed_outcomes = Table.read_table('observed_outcomes.csv')\n",
    "observed_outcomes.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_proportions = bta.group('Group', np.average).column(1)\n",
    "group_proportions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Computed from the table instead of retyping the two proportions by hand.\n",
    "abs(group_proportions.item(0) - group_proportions.item(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def distance_between_group_proportions(tbl):\n",
    "    \"\"\"Return the absolute difference between the two per-group averages of a table.\n",
    "\n",
    "    tbl is grouped by its 'Group' column with np.average, and the averages are\n",
    "    read from column 1 of the grouped table (the first averaged column). When\n",
    "    that column holds 0/1 values, each average is a proportion.\n",
    "    \"\"\"\n",
    "    proportions = tbl.group('Group', np.average).column(1)\n",
    "    return abs(proportions.item(1) - proportions.item(0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "observed_distance = distance_between_group_proportions(bta)\n",
    "observed_distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = bta.select('Group')\n",
    "results = bta.select('Result')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Repeat: shuffle the results against the fixed group labels and record the distance.\n",
    "num_repetitions = 2000\n",
    "distances = make_array()\n",
    "for i in np.arange(num_repetitions):\n",
    "    shuffled_results = results.sample(with_replacement=False).column(0)\n",
    "    simulated = labels.with_column('Shuffled results', shuffled_results)\n",
    "    distance = distance_between_group_proportions(simulated)\n",
    "    distances = np.append(distances, distance)\n",
    "\n",
    "distances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_column('Distance', distances).hist(bins = np.arange(0, 1, 0.1))\n",
    "plots.title('Simulated distances between group proportions')\n",
    "# The red dot marks the distance computed from the unshuffled data.\n",
    "plots.scatter(observed_distance, 0, color='red', s=40);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Proportion of simulated distances that are at least the observed distance.\n",
    "np.average(distances >= observed_distance)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}