{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Regression and Prediction\n", "====\n", "\n", "Date: October 2, 2014\n", "\n", "Copyright (c) 2014 Rafael A. Irizarry MIT License" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%matplotlib inline \n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from bs4 import BeautifulSoup\n", "import requests\n", "from pattern import web\n", "import scipy.stats as stats\n", "import statsmodels.api as sm\n", "from scipy.stats import binom\n", "from __future__ import division\n", "import re\n", "from StringIO import StringIO\n", "from zipfile import ZipFile \n", "from pandas import read_csv\n", "\n", "#nice defaults for matplotlib\n", "from matplotlib import rcParams\n", "\n", "dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),\n", " (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),\n", " (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),\n", " (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),\n", " (0.4, 0.6509803921568628, 0.11764705882352941),\n", " (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),\n", " (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),\n", " (0.4, 0.4, 0.4)]\n", "\n", "rcParams['figure.figsize'] = (10, 6)\n", "rcParams['figure.dpi'] = 150\n", "rcParams['axes.color_cycle'] = dark2_colors\n", "rcParams['lines.linewidth'] = 2\n", "rcParams['axes.grid'] = True\n", "rcParams['axes.facecolor'] = '#eeeeee'\n", "rcParams['font.size'] = 14\n", "rcParams['patch.edgecolor'] = 'none'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Baseball\n", "===\n", "\n", "\n", "Baseball\n", "===\n", "\n", "\n", "Statistics\n", "====\n", "\n", "\n", "Sabermetrics\n", "====\n", "\n", "\n", "\n", "Bill James " 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Some Terms\n", "===\n", "- R - Runs\n", "- AB - At bats\n", "- SO - Strike out\n", "- H - Hitter puts the ball in play and is not out\n", "- 2B - Hitter gets to second\n", "- 3B - Hitter gets to third\n", "- HR - Home Run, hitter scores a run\n", "- BB - Base on balls\n", "- PA - Plate appearances $\\approx$ AB+BB" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Tables\n", "====" ] }, { "cell_type": "code", "collapsed": false, "input": [ "zip_folder = requests.get('http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip').content\n", "zip_files = StringIO()\n", "zip_files.write(zip_folder)\n", "csv_files = ZipFile(zip_files)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "teams = csv_files.open('Teams.csv')\n", "teams = read_csv(teams)\n", "\n", "players = csv_files.open('Batting.csv')\n", "players = read_csv(players)\n", "\n", "salaries = csv_files.open('Salaries.csv')\n", "salaries = read_csv(salaries)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "dat = teams[(teams['G'] == 162) & (teams['yearID']<2002) ]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "dat[[\"teamID\",\"yearID\", \"H\", \"2B\", \"3B\", \"HR\", \"BB\"]].head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | teamID | \n", "yearID | \n", "H | \n", "2B | \n", "3B | \n", "HR | \n", "BB | \n", "
---|---|---|---|---|---|---|---|
437 | \n", "DET | \n", "1904 | \n", "1231 | \n", "154 | \n", "69 | \n", "11 | \n", "344 | \n", "
1366 | \n", "LAA | \n", "1961 | \n", "1331 | \n", "218 | \n", "22 | \n", "189 | \n", "681 | \n", "
1367 | \n", "KC1 | \n", "1961 | \n", "1342 | \n", "216 | \n", "47 | \n", "90 | \n", "580 | \n", "
1377 | \n", "NYA | \n", "1962 | \n", "1509 | \n", "240 | \n", "29 | \n", "199 | \n", "584 | \n", "
1379 | \n", "LAA | \n", "1962 | \n", "1377 | \n", "232 | \n", "35 | \n", "137 | \n", "602 | \n", "
5 rows \u00d7 7 columns
\n", "\n", " | playerID | \n", "avg1 | \n", "avg2 | \n", "
---|---|---|---|
896 | \n", "olerujo01 | \n", "363 | \n", "294 | \n", "
113 | \n", "boggswa01 | \n", "361 | \n", "325 | \n", "
210 | \n", "cashno01 | \n", "361 | \n", "243 | \n", "
1020 | \n", "rodrial01 | \n", "358 | \n", "300 | \n", "
751 | \n", "madlobi01 | \n", "354 | \n", "339 | \n", "
490 | \n", "gwynnto01 | \n", "351 | \n", "317 | \n", "
1135 | \n", "suzukic01 | \n", "350 | \n", "321 | \n", "
780 | \n", "mauerjo01 | \n", "347 | \n", "328 | \n", "
304 | \n", "davisto02 | \n", "346 | \n", "326 | \n", "
789 | \n", "mayswi01 | \n", "345 | \n", "319 | \n", "
10 rows \u00d7 3 columns
\n", "\n", " | playerID | \n", "avg1 | \n", "avg2 | \n", "
---|---|---|---|
629 | \n", "joosted01 | \n", "206 | \n", "250 | \n", "
963 | \n", "priddje01 | \n", "214 | \n", "296 | \n", "
842 | \n", "moneydo01 | \n", "222 | \n", "284 | \n", "
533 | \n", "hernaen01 | \n", "222 | \n", "232 | \n", "
1117 | \n", "staubru01 | \n", "224 | \n", "280 | \n", "
132 | \n", "boyercl02 | \n", "224 | \n", "272 | \n", "
689 | \n", "lanieha01 | \n", "226 | \n", "213 | \n", "
871 | \n", "murphda05 | \n", "226 | \n", "281 | \n", "
160 | \n", "brunato01 | \n", "227 | \n", "254 | \n", "
1111 | \n", "stallvi01 | \n", "228 | \n", "254 | \n", "
10 rows \u00d7 3 columns
\n", "