{ "metadata": { "name": "", "signature": "sha256:9ef8012a6a994c0fe838bff4a0334f1d703a756d7dc3d82b9ad34418b92e21cc" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Interactions and ANOVA" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: This script is based heavily on Jonathan Taylor's class notes http://www.stanford.edu/class/stats191/interactions.html\n", "\n", "Download and format data:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from __future__ import print_function\n", "from statsmodels.compat import urlopen\n", "import numpy as np\n", "np.set_printoptions(precision=4, suppress=True)\n", "import statsmodels.api as sm\n", "import pandas as pd\n", "pd.set_option(\"display.width\", 100)\n", "import matplotlib.pyplot as plt\n", "from statsmodels.formula.api import ols\n", "from statsmodels.graphics.api import interaction_plot, abline_plot\n", "from statsmodels.stats.anova import anova_lm\n", "\n", "try:\n", " salary_table = pd.read_csv('salary.table')\n", "except: # recent pandas can read URL without urlopen\n", " url = 'http://stats191.stanford.edu/data/salary.table'\n", " fh = urlopen(url)\n", " salary_table = pd.read_table(fh)\n", " salary_table.to_csv('salary.table')\n", "\n", "E = salary_table.E\n", "M = salary_table.M\n", "X = salary_table.X\n", "S = salary_table.S" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Take a look at the data:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "plt.figure(figsize=(6,6))\n", "symbols = ['D', '^']\n", "colors = ['r', 'g', 'blue']\n", "factor_groups = salary_table.groupby(['E','M'])\n", "for values, group in factor_groups:\n", " i,j = values\n", " plt.scatter(group['X'], group['S'], marker=symbols[j], color=colors[i-1],\n", " s=144)\n", "plt.xlabel('Experience');\n", "plt.ylabel('Salary');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fit a linear model:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "formula = 'S ~ C(E) + C(M) + X'\n", "lm = ols(formula, salary_table).fit()\n", "print(lm.summary())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Have a look at the created design matrix: " ] }, { "cell_type": "code", "collapsed": false, "input": [ "lm.model.exog[:5]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Or since we initially passed in a DataFrame, we have a DataFrame available in" ] }, { "cell_type": "code", "collapsed": false, "input": [ "lm.model.data.orig_exog[:5]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We keep a reference to the original untouched data in" ] }, { "cell_type": "code", "collapsed": false, "input": [ "lm.model.data.frame[:5]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Influence statistics" ] }, { "cell_type": "code", "collapsed": false, "input": [ "infl = lm.get_influence()\n", "print(infl.summary_table())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "or get a dataframe" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df_infl = infl.summary_frame()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df_infl[:5]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now plot the reiduals within the groups separately:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "resid = lm.resid\n", "plt.figure(figsize=(6,6));\n", "for values, group in factor_groups:\n", " i,j = values\n", " group_num = i*2 + j - 1 # for plotting purposes\n", " x = [group_num] * len(group)\n", " plt.scatter(x, resid[group.index], marker=symbols[j], color=colors[i-1],\n", " s=144, edgecolors='black')\n", "plt.xlabel('Group');\n", "plt.ylabel('Residuals');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we will test some interactions using anova or f_test" ] }, { "cell_type": "code", "collapsed": false, "input": [ "interX_lm = ols(\"S ~ C(E) * X + C(M)\", salary_table).fit()\n", "print(interX_lm.summary())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Do an ANOVA check" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from statsmodels.stats.api import anova_lm\n", "\n", "table1 = anova_lm(lm, interX_lm)\n", "print(table1)\n", "\n", "interM_lm = ols(\"S ~ X + C(E)*C(M)\", data=salary_table).fit()\n", "print(interM_lm.summary())\n", "\n", "table2 = anova_lm(lm, interM_lm)\n", "print(table2)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The design matrix as a DataFrame" ] }, { "cell_type": "code", "collapsed": false, "input": [ "interM_lm.model.data.orig_exog[:5]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The design matrix as an ndarray" ] }, { "cell_type": "code", "collapsed": false, "input": [ "interM_lm.model.exog\n", "interM_lm.model.exog_names" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "infl = interM_lm.get_influence()\n", "resid = infl.resid_studentized_internal\n", "plt.figure(figsize=(6,6))\n", "for values, group in factor_groups:\n", " i,j = values\n", " idx = group.index\n", " plt.scatter(X[idx], resid[idx], marker=symbols[j], color=colors[i-1],\n", " s=144, edgecolors='black')\n", "plt.xlabel('X');\n", "plt.ylabel('standardized resids');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looks like one observation is an outlier." ] }, { "cell_type": "code", "collapsed": false, "input": [ "drop_idx = abs(resid).argmax()\n", "print(drop_idx) # zero-based index\n", "idx = salary_table.index.drop(drop_idx)\n", "\n", "lm32 = ols('S ~ C(E) + X + C(M)', data=salary_table, subset=idx).fit()\n", "\n", "print(lm32.summary())\n", "print('\\n')\n", "\n", "interX_lm32 = ols('S ~ C(E) * X + C(M)', data=salary_table, subset=idx).fit()\n", "\n", "print(interX_lm32.summary())\n", "print('\\n')\n", "\n", "\n", "table3 = anova_lm(lm32, interX_lm32)\n", "print(table3)\n", "print('\\n')\n", "\n", "\n", "interM_lm32 = ols('S ~ X + C(E) * C(M)', data=salary_table, subset=idx).fit()\n", "\n", "table4 = anova_lm(lm32, interM_lm32)\n", "print(table4)\n", "print('\\n')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Replot the residuals" ] }, { "cell_type": "code", "collapsed": false, "input": [ "try:\n", " resid = interM_lm32.get_influence().summary_frame()['standard_resid']\n", "except:\n", " resid = interM_lm32.get_influence().summary_frame()['standard_resid']\n", "\n", "plt.figure(figsize=(6,6))\n", "for values, group in factor_groups:\n", " i,j = values\n", " idx = group.index\n", " plt.scatter(X[idx], resid[idx], marker=symbols[j], color=colors[i-1],\n", " s=144, edgecolors='black')\n", "plt.xlabel('X[~[32]]');\n", "plt.ylabel('standardized resids');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Plot the fitted values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "lm_final = ols('S ~ X + C(E)*C(M)', data = salary_table.drop([drop_idx])).fit()\n", "mf = lm_final.model.data.orig_exog\n", "lstyle = ['-','--']\n", "\n", "plt.figure(figsize=(6,6))\n", "for values, group in factor_groups:\n", " i,j = values\n", " idx = group.index\n", " plt.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i-1],\n", " s=144, edgecolors='black')\n", " # drop NA because there is no idx 32 in the final model\n", " plt.plot(mf.X[idx].dropna(), lm_final.fittedvalues[idx].dropna(),\n", " ls=lstyle[j], color=colors[i-1])\n", "plt.xlabel('Experience');\n", "plt.ylabel('Salary');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From our first look at the data, the difference between Master's and PhD in the management group is different than in the non-management group. This is an interaction between the two qualitative variables management,M and education,E. We can visualize this by first removing the effect of experience, then plotting the means within each of the 6 groups using interaction.plot." ] }, { "cell_type": "code", "collapsed": false, "input": [ "U = S - X * interX_lm32.params['X']\n", "\n", "plt.figure(figsize=(6,6))\n", "interaction_plot(E, M, U, colors=['red','blue'], markers=['^','D'],\n", " markersize=10, ax=plt.gca())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Minority Employment Data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "try:\n", " minority_table = pd.read_table('minority.table')\n", "except: # don't have data already\n", " url = 'http://stats191.stanford.edu/data/minority.table'\n", " minority_table = pd.read_table(url)\n", "\n", "factor_group = minority_table.groupby(['ETHN'])\n", "\n", "fig, ax = plt.subplots(figsize=(6,6))\n", "colors = ['purple', 'green']\n", "markers = ['o', 'v']\n", "for factor, group in factor_group:\n", " ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],\n", " marker=markers[factor], s=12**2)\n", "ax.set_xlabel('TEST');\n", "ax.set_ylabel('JPERF');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "min_lm = ols('JPERF ~ TEST', data=minority_table).fit()\n", "print(min_lm.summary())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "fig, ax = plt.subplots(figsize=(6,6));\n", "for factor, group in factor_group:\n", " ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],\n", " marker=markers[factor], s=12**2)\n", "\n", "ax.set_xlabel('TEST')\n", "ax.set_ylabel('JPERF')\n", "fig = abline_plot(model_results = min_lm, ax=ax)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "min_lm2 = ols('JPERF ~ TEST + TEST:ETHN',\n", " data=minority_table).fit()\n", "\n", "print(min_lm2.summary())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "fig, ax = plt.subplots(figsize=(6,6));\n", "for factor, group in factor_group:\n", " ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],\n", " marker=markers[factor], s=12**2)\n", "\n", "fig = abline_plot(intercept = min_lm2.params['Intercept'],\n", " slope = min_lm2.params['TEST'], ax=ax, color='purple');\n", "fig = abline_plot(intercept = min_lm2.params['Intercept'],\n", " slope = min_lm2.params['TEST'] + min_lm2.params['TEST:ETHN'],\n", " ax=ax, color='green');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "min_lm3 = ols('JPERF ~ TEST + ETHN', data = minority_table).fit()\n", "print(min_lm3.summary())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "fig, ax = plt.subplots(figsize=(6,6));\n", "for factor, group in factor_group:\n", " ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],\n", " marker=markers[factor], s=12**2)\n", "\n", "fig = abline_plot(intercept = min_lm3.params['Intercept'],\n", " slope = min_lm3.params['TEST'], ax=ax, color='purple');\n", "fig = abline_plot(intercept = min_lm3.params['Intercept'] + min_lm3.params['ETHN'],\n", " slope = min_lm3.params['TEST'], ax=ax, color='green');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "min_lm4 = ols('JPERF ~ TEST * ETHN', data = minority_table).fit()\n", "print(min_lm4.summary())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "fig, ax = plt.subplots(figsize=(8,6));\n", "for factor, group in factor_group:\n", " ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],\n", " marker=markers[factor], s=12**2)\n", "\n", "fig = abline_plot(intercept = min_lm4.params['Intercept'],\n", " slope = min_lm4.params['TEST'], ax=ax, color='purple');\n", "fig = abline_plot(intercept = min_lm4.params['Intercept'] + min_lm4.params['ETHN'],\n", " slope = min_lm4.params['TEST'] + min_lm4.params['TEST:ETHN'],\n", " ax=ax, color='green');" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# is there any effect of ETHN on slope or intercept?\n", "table5 = anova_lm(min_lm, min_lm4)\n", "print(table5)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# is there any effect of ETHN on intercept\n", "table6 = anova_lm(min_lm, min_lm3)\n", "print(table6)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# is there any effect of ETHN on slope\n", "table7 = anova_lm(min_lm, min_lm2)\n", "print(table7)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# is it just the slope or both?\n", "table8 = anova_lm(min_lm2, min_lm4)\n", "print(table8)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## One-way ANOVA" ] }, { "cell_type": "code", "collapsed": false, "input": [ "try:\n", " rehab_table = pd.read_csv('rehab.table')\n", "except:\n", " url = 'http://stats191.stanford.edu/data/rehab.csv'\n", " rehab_table = pd.read_table(url, delimiter=\",\")\n", " rehab_table.to_csv('rehab.table')\n", "\n", "fig, ax = plt.subplots(figsize=(8,6))\n", "fig = rehab_table.boxplot('Time', 'Fitness', ax=ax, grid=False)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rehab_lm = ols('Time ~ C(Fitness)', data=rehab_table).fit()\n", "table9 = anova_lm(rehab_lm)\n", "print(table9)\n", "\n", "print(rehab_lm.model.data.orig_exog)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(rehab_lm.summary())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Two-way ANOVA" ] }, { "cell_type": "code", "collapsed": false, "input": [ "try:\n", " kidney_table = pd.read_table('./kidney.table')\n", "except:\n", " url = 'http://stats191.stanford.edu/data/kidney.table'\n", " kidney_table = pd.read_table(url, delimiter=\" *\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Explore the dataset" ] }, { "cell_type": "code", "collapsed": false, "input": [ "kidney_table.groupby(['Weight', 'Duration']).size()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Balanced panel" ] }, { "cell_type": "code", "collapsed": false, "input": [ "kt = kidney_table\n", "plt.figure(figsize=(8,6))\n", "fig = interaction_plot(kt['Weight'], kt['Duration'], np.log(kt['Days']+1),\n", " colors=['red', 'blue'], markers=['D','^'], ms=10, ax=plt.gca())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You have things available in the calling namespace available in the formula evaluation namespace" ] }, { "cell_type": "code", "collapsed": false, "input": [ "kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight)', data=kt).fit()\n", "\n", "table10 = anova_lm(kidney_lm)\n", "\n", "print(anova_lm(ols('np.log(Days+1) ~ C(Duration) + C(Weight)',\n", " data=kt).fit(), kidney_lm))\n", "print(anova_lm(ols('np.log(Days+1) ~ C(Duration)', data=kt).fit(),\n", " ols('np.log(Days+1) ~ C(Duration) + C(Weight, Sum)',\n", " data=kt).fit()))\n", "print(anova_lm(ols('np.log(Days+1) ~ C(Weight)', data=kt).fit(),\n", " ols('np.log(Days+1) ~ C(Duration) + C(Weight, Sum)',\n", " data=kt).fit()))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sum of squares\n", "\n", " Illustrates the use of different types of sums of squares (I,II,II)\n", " and how the Sum contrast can be used to produce the same output between\n", " the 3.\n", "\n", " Types I and II are equivalent under a balanced design.\n", "\n", " Don't use Type III with non-orthogonal contrast - ie., Treatment" ] }, { "cell_type": "code", "collapsed": false, "input": [ "sum_lm = ols('np.log(Days+1) ~ C(Duration, Sum) * C(Weight, Sum)',\n", " data=kt).fit()\n", "\n", "print(anova_lm(sum_lm))\n", "print(anova_lm(sum_lm, typ=2))\n", "print(anova_lm(sum_lm, typ=3))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "nosum_lm = ols('np.log(Days+1) ~ C(Duration, Treatment) * C(Weight, Treatment)',\n", " data=kt).fit()\n", "print(anova_lm(nosum_lm))\n", "print(anova_lm(nosum_lm, typ=2))\n", "print(anova_lm(nosum_lm, typ=3))" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }