{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 17 ##"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mendel and Pea Flowers ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = make_array(0.75, 0.25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_proportions(929, model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# statistic: distance between sample percent (of purple plants) and 75\n",
    "\n",
    "abs(100 * sample_proportions(929, model).item(0) - 75)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simulation\n",
    "\n",
    "distances = make_array()\n",
    "\n",
    "for i in np.arange(10000):\n",
    "    new_distance = abs(100 * sample_proportions(929, model).item(0) - 75)\n",
    "    distances = np.append(distances, new_distance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_column('Distance from 75%', distances).hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 705 of Mendel's 929 plants were purple flowering\n",
    "\n",
    "observed_distance =  abs(100*(705/929) - 75)\n",
    "observed_distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_column('Distance from 75%', distances).hist()\n",
    "plots.scatter(observed_distance, 0, color='red', s=30);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Alameda County Jury Panels ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jury = Table().with_columns(\n",
    "    'Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'),\n",
    "    'Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01),\n",
    "    'Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04)\n",
    ")\n",
    "\n",
    "jury"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jury.barh('Ethnicity')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#####"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jury_with_diffs = jury.with_column(\n",
    "    'Difference', jury.column('Panels') - jury.column('Eligible')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jury_with_diffs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jury_with_diffs = jury_with_diffs.with_column(\n",
    "    'Absolute Difference', np.abs(jury_with_diffs.column('Difference'))\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jury_with_diffs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(jury_with_diffs.column('Absolute Difference'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(jury_with_diffs.column('Absolute Difference')) / 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def total_variation_distance(distribution_1, distribution_2):\n",
    "    return sum(np.abs(distribution_1 - distribution_2)) / 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_variation_distance(jury.column('Eligible'), jury.column('Panels'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "eligible = jury.column('Eligible')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_distribution = sample_proportions(1453, eligible)\n",
    "panels_and_sample = jury.with_column('Random Sample', sample_distribution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "panels_and_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "panels_and_sample.barh('Ethnicity')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_variation_distance(panels_and_sample.column('Random Sample'), eligible)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "observed_tvd = total_variation_distance(jury.column('Panels'), eligible)\n",
    "observed_tvd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_distribution = sample_proportions(1453, eligible)\n",
    "total_variation_distance(sample_distribution, eligible)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tvds = make_array()\n",
    "\n",
    "for i in np.arange(10000):\n",
    "    sample_distribution = sample_proportions(1453, eligible)\n",
    "    new_tvd = total_variation_distance(sample_distribution, eligible)\n",
    "    tvds = np.append(tvds, new_tvd)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table().with_column('Total Variation Distance', tvds).hist(bins = 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}