{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration shared by the simulations below.\n",
    "\n",
    "# sample_proportions draws from NumPy's global random state, so seeding it once\n",
    "# here makes a fresh Restart & Run All reproduce the same simulated values.\n",
    "np.random.seed(17)\n",
    "\n",
    "# Number of simulated samples drawn in each simulation\n",
    "repetitions = 10000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 17 ##"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mendel and Pea Flowers ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model proportions: 75% purple-flowering plants (item 0), 25% other (item 1)\n",
    "model = make_array(0.75, 0.25)\n",
    "\n",
    "# Mendel observed 929 plants; every simulated sample has the same size\n",
    "num_plants = 929"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_proportions(num_plants, model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def distance_from_75(purple_proportion):\n",
    "    \"\"\"Return the distance between a sample percent of purple plants and 75.\n",
    "\n",
    "    purple_proportion is a proportion (such as 0.76), not a percent;\n",
    "    it is multiplied by 100 before being compared with 75.\n",
    "    \"\"\"\n",
    "    return abs(100 * purple_proportion - 75)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# statistic: distance between sample percent (of purple plants) and 75\n",
    "\n",
    "distance_from_75(sample_proportions(num_plants, model).item(0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simulation\n",
    "\n",
    "# Build the array in one step: np.append copies the whole array on every call,\n",
    "# so appending inside the loop does far more work than necessary.\n",
    "distances = np.array([\n",
    "    distance_from_75(sample_proportions(num_plants, model).item(0))\n",
    "    for _ in range(repetitions)\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_column('Distance from 75%', distances).hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 705 of Mendel's 929 plants were purple flowering\n",
    "\n",
    "observed_distance = distance_from_75(705 / num_plants)\n",
    "observed_distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same histogram, with the observed statistic marked as a red dot on the horizontal axis\n",
    "Table().with_column('Distance from 75%', distances).hist()\n",
    "plots.scatter(observed_distance, 0, color='red', s=30);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Alameda County Jury Panels ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jury = Table().with_columns(\n",
    "    'Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'),\n",
    "    'Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01),\n",
    "    'Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04)\n",
    ")\n",
    "\n",
    "jury"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jury.barh('Ethnicity')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Total Variation Distance ###"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jury_with_diffs = jury.with_column(\n",
    "    'Difference', jury.column('Panels') - jury.column('Eligible')\n",
    ")\n",
    "\n",
    "jury_with_diffs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jury_with_diffs = jury_with_diffs.with_column(\n",
    "    'Absolute Difference', np.abs(jury_with_diffs.column('Difference'))\n",
    ")\n",
    "\n",
    "jury_with_diffs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(jury_with_diffs.column('Absolute Difference'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(jury_with_diffs.column('Absolute Difference')) / 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def total_variation_distance(distribution_1, distribution_2):\n",
    "    \"\"\"Return the total variation distance between two distributions.\n",
    "\n",
    "    Each argument is an array (or other sequence) of proportions listed\n",
    "    over the same categories in the same order. The result is half the\n",
    "    sum of the absolute differences between corresponding entries.\n",
    "    \"\"\"\n",
    "    # np.asarray lets plain lists be passed as well as NumPy arrays\n",
    "    differences = np.asarray(distribution_1) - np.asarray(distribution_2)\n",
    "    return np.abs(differences).sum() / 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_variation_distance(jury.column('Eligible'), jury.column('Panels'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Random Samples from the Eligible Distribution ###"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "eligible = jury.column('Eligible')\n",
    "\n",
    "# Number of draws in each simulated sample\n",
    "sample_size = 1453"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_distribution = sample_proportions(sample_size, eligible)\n",
    "panels_and_sample = jury.with_column('Random Sample', sample_distribution)\n",
    "\n",
    "panels_and_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "panels_and_sample.barh('Ethnicity')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_variation_distance(panels_and_sample.column('Random Sample'), eligible)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "observed_tvd = total_variation_distance(jury.column('Panels'), eligible)\n",
    "observed_tvd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One simulated value of the statistic\n",
    "sample_distribution = sample_proportions(sample_size, eligible)\n",
    "total_variation_distance(sample_distribution, eligible)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simulation: build the array in one step instead of calling np.append\n",
    "# inside the loop, which copies the whole array on every call.\n",
    "tvds = np.array([\n",
    "    total_variation_distance(sample_proportions(sample_size, eligible), eligible)\n",
    "    for _ in range(repetitions)\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_column('Total Variation Distance', tvds).hist(bins = 20)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}