{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Confounding\n",
      "====\n",
      "\n",
      "Date: October 9, 2014\n",
      "\n",
      "Copyright (c) 2014 Rafael A. Irizarry MIT License"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%matplotlib inline \n",
      "import pandas as pd\n",
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "from bs4 import BeautifulSoup\n",
      "import requests\n",
      "from pattern import web\n",
      "import scipy.stats as stats\n",
      "import statsmodels.api as sm\n",
      "from scipy.stats import binom\n",
      "from __future__ import division\n",
      "import re\n",
      "from StringIO import StringIO\n",
      "from zipfile import ZipFile \n",
      "from pandas import read_csv\n",
      "from urllib import urlopen\n",
      "\n",
      "#nice defaults for matplotlib\n",
      "from matplotlib import rcParams\n",
      "\n",
      "dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),\n",
      " (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),\n",
      " (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),\n",
      " (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),\n",
      " (0.4, 0.6509803921568628, 0.11764705882352941),\n",
      " (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),\n",
      " (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),\n",
      " (0.4, 0.4, 0.4)]\n",
      "\n",
      "rcParams['figure.figsize'] = (10, 6)\n",
      "rcParams['figure.dpi'] = 150\n",
      "rcParams['axes.color_cycle'] = dark2_colors\n",
      "rcParams['lines.linewidth'] = 2\n",
      "rcParams['axes.grid'] = True\n",
      "rcParams['axes.facecolor'] = '#eeeeee'\n",
      "rcParams['font.size'] = 14\n",
      "rcParams['patch.edgecolor'] = 'none'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 324
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Download the Lahman CSV archive (2014-02-14 release) and open it as an\n",
      "# in-memory zip file, so nothing is written to disk.\n",
      "response = requests.get('http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip')\n",
      "# Fail here with a clear HTTP error instead of later with an opaque zip error.\n",
      "response.raise_for_status()\n",
      "zip_folder = response.content\n",
      "zip_files = StringIO(zip_folder)\n",
      "csv_files = ZipFile(zip_files)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 325
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Parse Teams.csv straight out of the archive. `teams` is only ever bound to\n",
      "# the DataFrame, never to the intermediate file handle.\n",
      "teams = read_csv(csv_files.open('Teams.csv'))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 326
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Keep rows with G == 162 and yearID before 2002. The explicit .copy() makes\n",
      "# `dat` independent of `teams`, so adding a column below is a plain assignment\n",
      "# rather than a chained assignment on a slice (SettingWithCopyWarning).\n",
      "dat = teams[(teams['G']==162) & (teams['yearID'] < 2002)].copy()\n",
      "# Singles = H minus the 2B, 3B and HR columns.\n",
      "dat['Singles'] = dat['H']-dat['2B']-dat['3B']-dat['HR']\n",
      "dat = dat[['R','Singles', 'HR', 'BB']]\n",
      "dat.head(5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [ "
\n", " | R | \n", "Singles | \n", "HR | \n", "BB | \n", "
---|---|---|---|---|
437 | \n", "505 | \n", "997 | \n", "11 | \n", "344 | \n", "
1366 | \n", "744 | \n", "902 | \n", "189 | \n", "681 | \n", "
1367 | \n", "683 | \n", "989 | \n", "90 | \n", "580 | \n", "
1377 | \n", "817 | \n", "1041 | \n", "199 | \n", "584 | \n", "
1379 | \n", "718 | \n", "973 | \n", "137 | \n", "602 | \n", "
5 rows \u00d7 4 columns
\n", "\n", " | Major | \n", "Number | \n", "Percent | \n", "Gender | \n", "
---|---|---|---|---|
0 | \n", "A | \n", "825 | \n", "62 | \n", "1 | \n", "
1 | \n", "B | \n", "560 | \n", "63 | \n", "1 | \n", "
2 | \n", "C | \n", "325 | \n", "37 | \n", "1 | \n", "
3 rows \u00d7 4 columns
\n", "\n", " | Major | \n", "Male | \n", "Female | \n", "
---|---|---|---|
0 | \n", "A | \n", "62 | \n", "82 | \n", "
1 | \n", "B | \n", "63 | \n", "68 | \n", "
2 | \n", "C | \n", "37 | \n", "34 | \n", "
3 | \n", "D | \n", "33 | \n", "35 | \n", "
4 | \n", "E | \n", "28 | \n", "24 | \n", "
5 | \n", "F | \n", "6 | \n", "7 | \n", "
6 rows \u00d7 3 columns
\n", "