{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Bayesian Statistics\n", "====\n", "Date: October 16, 2014\n", "\n", "Copyright (c) 2014 Rafael A. Irizarry MIT License" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# special IPython command to prepare the notebook for matplotlib\n", "%matplotlib inline \n", "\n", "import numpy as np\n", "import scipy\n", "import pandas as pd # pandas\n", "import matplotlib.pyplot as plt # module for plotting \n", "from mpl_toolkits.mplot3d import Axes3D #3D plotting\n", "import datetime as dt # module for manipulating dates and times\n", "\n", "import requests \n", "import scipy.stats as stats\n", "import statsmodels.api as sm\n", "from scipy.stats import binom\n", "from __future__ import division\n", "import re\n", "from StringIO import StringIO\n", "from zipfile import ZipFile \n", "from pandas import read_csv\n", "from urllib import urlopen\n", "import urllib2\n", "import json\n", "\n", "import sklearn\n", "import sklearn.preprocessing\n", "import sklearn.datasets\n", "\n", "#nice defaults for matplotlib\n", "from matplotlib import rcParams\n", "\n", "dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),\n", " (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),\n", " (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),\n", " (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),\n", " (0.4, 0.6509803921568628, 0.11764705882352941),\n", " (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),\n", " (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),\n", " (0.4, 0.4, 0.4)]\n", "\n", "rcParams['figure.figsize'] = (10, 6)\n", "rcParams['figure.dpi'] = 150\n", "rcParams['axes.color_cycle'] = dark2_colors\n", "rcParams['lines.linewidth'] = 2\n", "rcParams['axes.grid'] = True\n", "rcParams['axes.facecolor'] = '#eeeeee'\n", "rcParams['font.size'] = 
14\n", "rcParams['patch.edgecolor'] = 'none'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 289 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cystic Fibrosis Test\n", "====\n", "\n", "* A test for cystic fibrosis has an accuracy of 99%: \n", "\n", "$$\\mbox{Prob}(+|D)=0.99, \\mbox{Prob}(-|\\mbox{no } D)=0.99,$$\n", "\n", "\n", "* If we select a random person and they test positive, what is the probability that they have the disease? \n", "\n", "* We write this as $\\mbox{Prob}(D|+)?$\n", "\n", "* The cystic fibrosis rate is about 1 in 3,900, so $\\mbox{Prob}(D) \\approx 0.00025$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Bayes Rule\n", "====\n", "\n", "\n", "$$\n", "\\mbox{Pr}(A|B) = \\frac{\\mbox{Pr}(B|A)\\mbox{Pr}(A)}{\\mbox{Pr}(B)} \n", "$$\n", "\n", "$$\n", "\\begin{eqnarray*}\n", "\\mbox{Prob}(D|+) & = & \\frac{ P(+|D) \\cdot P(D)} {\\mbox{Prob}(+)} \\\\\n", "& = & \\frac{\\mbox{Prob}(+|D)\\cdot P(D)} {\\mbox{Prob}(+|D) \\cdot P(D) + \\mbox{Prob}(+|\\mbox{no } D) \\mbox{Prob}(\\mbox{no } D)} \\\\\n", "\\end{eqnarray*}\n", "$$\n", "\n", "$$\n", "\\begin{eqnarray*}\n", "\\mbox{Prob}(D|+) & = & \\frac{ P(+|D) \\cdot P(D)} {\\mbox{Prob}(+)} \\\\\n", "& = & \\frac{\\mbox{Prob}(+|D)\\cdot P(D)} {\\mbox{Prob}(+|D) \\cdot P(D) + \\mbox{Prob}(+|\\mbox{no } D) \\mbox{Prob}(\\mbox{no } D)} \\\\\n", "& = & \\frac{0.99 \\cdot 0.00025}{0.99 \\cdot 0.00025 + 0.01 \\cdot (0.99975)} \\\\\n", "& = & 0.02 \\;\\;\\; \\mbox{not} \\; \\; \\; 0.99\n", "\\end{eqnarray*}\n", "$$\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Simulation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Bayes in Practice\n", "===\n", "\n", "![iglesias](http://upload.wikimedia.org/wikipedia/commons/thumb/9/98/Jos%C3%A9_Iglesias_on_September_28%2C_2012.jpg/902px-Jos%C3%A9_Iglesias_on_September_28%2C_2012.jpg)\n", "\n", "***\n", "\n", "Jos\u00e9 Iglesias 2013\n", "\n", "| Month | At Bats | H | AVG |\n", "|-------|---------|---|-----|\n", "| April | 20 | 9 | .450 
|\n", "\n", "What is your prediction for his average in October?\n", "\n", "Note: No one has finished a \n", "season batting .400 since \n", "Ted Williams in 1941!\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Distribution of AVG \n", "===\n", "This is for all players (>500 AB) 2010, 2011, 2012" ] }, { "cell_type": "code", "collapsed": false, "input": [ "zip_folder = requests.get('http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip').content\n", "zip_files = StringIO()\n", "zip_files.write(zip_folder)\n", "csv_files = ZipFile(zip_files)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 290 }, { "cell_type": "code", "collapsed": false, "input": [ "players = csv_files.open('Batting.csv')\n", "players = read_csv(players)\n", "players.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | playerID | \n", "yearID | \n", "stint | \n", "teamID | \n", "lgID | \n", "G | \n", "G_batting | \n", "AB | \n", "R | \n", "H | \n", "2B | \n", "3B | \n", "HR | \n", "RBI | \n", "SB | \n", "CS | \n", "BB | \n", "SO | \n", "IBB | \n", "HBP | \n", "\n", " |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "aardsda01 | \n", "2004 | \n", "1 | \n", "SFN | \n", "NL | \n", "11 | \n", "11 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "
1 | \n", "aardsda01 | \n", "2006 | \n", "1 | \n", "CHN | \n", "NL | \n", "45 | \n", "43 | \n", "2 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "
2 | \n", "aardsda01 | \n", "2007 | \n", "1 | \n", "CHA | \n", "AL | \n", "25 | \n", "2 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "
3 | \n", "aardsda01 | \n", "2008 | \n", "1 | \n", "BOS | \n", "AL | \n", "47 | \n", "5 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "... | \n", "
4 | \n", "aardsda01 | \n", "2009 | \n", "1 | \n", "SEA | \n", "AL | \n", "73 | \n", "3 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "
5 rows \u00d7 24 columns
\n", "\n", " | states | \n", "R | \n", "D | \n", "incumbent | \n", "
---|---|---|---|---|
0 | \n", "KY | \n", "McConnell | \n", "Grimes | \n", "Rep | \n", "
1 | \n", "AR | \n", "Cotton | \n", "Pryor | \n", "Dem | \n", "
2 | \n", "MI | \n", "Land | \n", "Peters | \n", "NaN | \n", "
3 | \n", "LA | \n", "Cassidy | \n", "Landrieu | \n", "Dem | \n", "
4 | \n", "NH | \n", "Brown | \n", "Shaheen | \n", "Dem | \n", "
5 rows \u00d7 4 columns
\n", "