{ "metadata": { "name": "", "signature": "sha256:a992c3ee53b62d7c984ded70673dee2cb0d547a694dcf174a874b3a61052df9c" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "M-Estimators for Robust Linear Modeling" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from __future__ import print_function\n", "from statsmodels.compat import lmap\n", "import numpy as np\n", "from scipy import stats\n", "import matplotlib.pyplot as plt\n", "\n", "import statsmodels.api as sm" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* An M-estimator minimizes the function \n", "\n", "$$Q(e_i, \\rho) = \\sum_i~\\rho \\left (\\frac{e_i}{s}\\right )$$\n", "\n", "where $\\rho$ is a symmetric function of the residuals \n", "\n", "* The effect of $\\rho$ is to reduce the influence of outliers\n", "* $s$ is an estimate of scale. \n", "* The robust estimates $\\hat{\\beta}$ are computed by the iteratively re-weighted least squares algorithm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* We have several choices available for the weighting functions to be used" ] }, { "cell_type": "code", "collapsed": false, "input": [ "norms = sm.robust.norms" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "def plot_weights(support, weights_func, xlabels, xticks):\n", " fig = plt.figure(figsize=(12,8))\n", " ax = fig.add_subplot(111)\n", " ax.plot(support, weights_func(support))\n", " ax.set_xticks(xticks)\n", " ax.set_xticklabels(xlabels, fontsize=16)\n", " ax.set_ylim(-.1, 1.1)\n", " return ax" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Andrew's Wave" ] }, { "cell_type": "code", "collapsed": false, "input": [ "help(norms.AndrewWave.weights)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "a = 1.339\n", "support = np.linspace(-np.pi*a, np.pi*a, 100)\n", "andrew = norms.AndrewWave(a=a)\n", "plot_weights(support, andrew.weights, ['$-\\pi*a$', '0', '$\\pi*a$'], [-np.pi*a, 0, np.pi*a]);" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Hampel's 17A" ] }, { "cell_type": "code", "collapsed": false, "input": [ "help(norms.Hampel.weights)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "c = 8\n", "support = np.linspace(-3*c, 3*c, 1000)\n", "hampel = norms.Hampel(a=2., b=4., c=c)\n", "plot_weights(support, hampel.weights, ['3*c', '0', '3*c'], [-3*c, 0, 3*c]);" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Huber's t" ] }, { "cell_type": "code", "collapsed": false, "input": [ "help(norms.HuberT.weights)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "t = 1.345\n", "support = np.linspace(-3*t, 3*t, 1000)\n", "huber = norms.HuberT(t=t)\n", "plot_weights(support, huber.weights, ['-3*t', '0', '3*t'], [-3*t, 0, 3*t]);" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Least Squares" ] }, { "cell_type": "code", "collapsed": false, "input": [ "help(norms.LeastSquares.weights)" ], "language": "python", "metadata": {}, 
"outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "support = np.linspace(-3, 3, 1000)\n", "lst_sq = norms.LeastSquares()\n", "plot_weights(support, lst_sq.weights, ['-3', '0', '3'], [-3, 0, 3]);" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Ramsay's Ea" ] }, { "cell_type": "code", "collapsed": false, "input": [ "help(norms.RamsayE.weights)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "a = .3\n", "support = np.linspace(-3*a, 3*a, 1000)\n", "ramsay = norms.RamsayE(a=a)\n", "plot_weights(support, ramsay.weights, ['-3*a', '0', '3*a'], [-3*a, 0, 3*a]);" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Trimmed Mean" ] }, { "cell_type": "code", "collapsed": false, "input": [ "help(norms.TrimmedMean.weights)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "c = 2\n", "support = np.linspace(-3*c, 3*c, 1000)\n", "trimmed = norms.TrimmedMean(c=c)\n", "plot_weights(support, trimmed.weights, ['-3*c', '0', '3*c'], [-3*c, 0, 3*c]);" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Tukey's Biweight" ] }, { "cell_type": "code", "collapsed": false, "input": [ "help(norms.TukeyBiweight.weights)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "c = 4.685\n", "support = np.linspace(-3*c, 3*c, 1000)\n", "tukey = norms.TukeyBiweight(c=c)\n", "plot_weights(support, tukey.weights, ['-3*c', '0', '3*c'], [-3*c, 0, 3*c]);" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Scale Estimators" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Robust estimates of the location" ] }, { "cell_type": "code", "collapsed": false, "input": [ "x = np.array([1, 2, 3, 4, 500])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* The mean is not a robust estimator of location" ] }, { "cell_type": "code", "collapsed": false, "input": [ "x.mean()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* The median, on the other hand, is a robust estimator with a breakdown point of 50%" ] }, { "cell_type": "code", "collapsed": false, "input": [ "np.median(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Analagously for the scale\n", "* The standard deviation is not robust" ] }, { "cell_type": "code", "collapsed": false, "input": [ "x.std()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Median Absolute Deviation\n", "\n", "$$ median_i |X_i - median_j(X_j)|) $$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Standardized Median Absolute Deviation is a consistent estimator for $\\hat{\\sigma}$\n", "\n", "$$\\hat{\\sigma}=K \\cdot MAD$$\n", "\n", "where $K$ depends on the distribution. 
{ "cell_type": "code", "collapsed": false, "input": [ "stats.norm.ppf(.75)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "sm.robust.scale.stand_mad(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "np.array([1,2,3,4,5.]).std()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* The default scale estimate for Robust Linear Models is the MAD\n", "* Another popular choice is Huber's proposal 2" ] }, { "cell_type": "code", "collapsed": false, "input": [ "np.random.seed(12345)\n", "fat_tails = stats.t(6).rvs(40)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "kde = sm.nonparametric.KDE(fat_tails)\n", "kde.fit()\n", "fig = plt.figure(figsize=(12,8))\n", "ax = fig.add_subplot(111)\n", "ax.plot(kde.support, kde.density);" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(fat_tails.mean(), fat_tails.std())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(stats.norm.fit(fat_tails))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(stats.t.fit(fat_tails, f0=6))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "huber = sm.robust.scale.Huber()\n", "loc, scale = huber(fat_tails)\n", "print(loc, scale)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "sm.robust.stand_mad(fat_tails)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "sm.robust.stand_mad(fat_tails, c=stats.t(6).ppf(.75))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "sm.robust.scale.mad(fat_tails)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Duncan's Occupational Prestige data - M-estimation for outliers" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from statsmodels.graphics.api import abline_plot\n", "from statsmodels.formula.api import ols, rlm" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "prestige = sm.datasets.get_rdataset(\"Duncan\", \"car\", cache=True).data" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(prestige.head(10))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "fig = plt.figure(figsize=(12,12))\n", "ax1 = fig.add_subplot(211, xlabel='Income', ylabel='Prestige')\n", "ax1.scatter(prestige.income, prestige.prestige)\n", "xy_outlier = prestige.ix['minister'][['income','prestige']]\n", "ax1.annotate('Minister', xy_outlier, xy_outlier+1, fontsize=16)\n", "ax2 = fig.add_subplot(212, xlabel='Education',\n", " ylabel='Prestige')\n", "ax2.scatter(prestige.education, prestige.prestige);" ], "language": "python", "metadata": {}, "outputs": [] },
{ "cell_type": "code", "collapsed": false, "input": [ "ols_model = ols('prestige ~ income + education', prestige).fit()\n", "print(ols_model.summary())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "infl = ols_model.get_influence()\n", "student = infl.summary_frame()['student_resid']\n", "print(student)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(student.ix[np.abs(student) > 2])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(infl.summary_frame().ix['minister'])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "sidak = ols_model.outlier_test('sidak')\n", "sidak.sort('unadj_p', inplace=True)\n", "print(sidak)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "fdr = ols_model.outlier_test('fdr_bh')\n", "fdr.sort('unadj_p', inplace=True)\n", "print(fdr)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rlm_model = rlm('prestige ~ income + education', prestige).fit()\n", "print(rlm_model.summary())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(rlm_model.weights)" ], "language": "python", "metadata": {}, "outputs": [] },
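{ "cell_type": "markdown", "metadata": {}, "source": [ "* As noted above, the default scale estimate in the RLM fit is the MAD; Huber's proposal 2 can be used instead by passing a HuberScale instance to the fit method (a minimal sketch, refitting the model above)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# refit the robust model using Huber's proposal 2 for the scale instead of the MAD\n", "rlm_huber = rlm('prestige ~ income + education', prestige).fit(scale_est=sm.robust.scale.HuberScale())\n", "print(rlm_huber.params)" ], "language": "python", "metadata": {}, "outputs": [] },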
{ "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Hertzsprung-Russell data for Star Cluster CYG OB1 - Leverage Points" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* The data consist of the luminosity and temperature of 47 stars in the direction of Cygnus." ] }, { "cell_type": "code", "collapsed": false, "input": [ "dta = sm.datasets.get_rdataset(\"starsCYG\", \"robustbase\", cache=True).data" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from matplotlib.patches import Ellipse\n", "fig = plt.figure(figsize=(12,8))\n", "ax = fig.add_subplot(111, xlabel='log(Temp)', ylabel='log(Light)', title='Hertzsprung-Russell Diagram of Star Cluster CYG OB1')\n", "ax.scatter(*dta.values.T)\n", "# highlight outliers\n", "e = Ellipse((3.5, 6), .2, 1, alpha=.25, color='r')\n", "ax.add_patch(e);\n", "ax.annotate('Red giants', xy=(3.6, 6), xytext=(3.8, 6),\n", " arrowprops=dict(facecolor='black', shrink=0.05, width=2),\n", " horizontalalignment='left', verticalalignment='bottom',\n", " clip_on=True, # clip to the axes bounding box\n", " fontsize=16,\n", " )\n", "# annotate these with their index\n", "for i,row in dta.ix[dta['log.Te'] < 3.8].iterrows():\n", " ax.annotate(i, row, row + .01, fontsize=14)\n", "xlim, ylim = ax.get_xlim(), ax.get_ylim()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from IPython.display import Image\n", "Image(filename='star_diagram.png')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "y = dta['log.light']\n", "X = sm.add_constant(dta['log.Te'], prepend=True)\n", "ols_model = sm.OLS(y, X).fit()\n", "abline_plot(model_results=ols_model, ax=ax)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rlm_mod = sm.RLM(y, X, sm.robust.norms.TrimmedMean(.5)).fit()\n", "abline_plot(model_results=rlm_mod, ax=ax, color='red')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Why does the robust (red) fit change so little? Because M-estimators are not robust to leverage points." ] }, { "cell_type": "code", "collapsed": false, "input": [ "infl = ols_model.get_influence()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "h_bar = 2*(ols_model.df_model + 1 )/ols_model.nobs\n", "hat_diag = infl.summary_frame()['hat_diag']\n", "hat_diag.ix[hat_diag > h_bar]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "sidak2 = ols_model.outlier_test('sidak')\n", "sidak2.sort('unadj_p', inplace=True)\n", "print(sidak2)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "fdr2 = ols_model.outlier_test('fdr_bh')\n", "fdr2.sort('unadj_p', inplace=True)\n", "print(fdr2)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Let's delete the RLM (red) line from the plot" ] }, { "cell_type": "code", "collapsed": false, "input": [ "del ax.lines[-1]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "weights = np.ones(len(X))\n", "weights[X[X['log.Te'] < 3.8].index.values - 1] = 0\n", "wls_model = sm.WLS(y, X, weights=weights).fit()\n", "abline_plot(model_results=wls_model, ax=ax, color='green')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* MM-estimators handle this type of problem well; unfortunately, statsmodels does not have them yet.\n", "* Support is being worked on, but this gives a good excuse to look at the R cell magics in the notebook." ] },
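{ "cell_type": "markdown", "metadata": {}, "source": [ "* Before switching to R, it is worth comparing the fitted coefficients numerically: the OLS and trimmed-mean RLM fits are both pulled toward the high-leverage red giants, while the WLS fit that excludes those stars looks very different" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# compare the straight-line fits plotted above: OLS, RLM with the trimmed-mean norm,\n", "# and WLS with the red giants given zero weight\n", "print(ols_model.params)\n", "print(rlm_mod.params)\n", "print(wls_model.params)" ], "language": "python", "metadata": {}, "outputs": [] },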
{ "cell_type": "code", "collapsed": false, "input": [ "yy = y.values[:,None]\n", "xx = X['log.Te'].values[:,None]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "%load_ext rmagic\n", "\n", "%R library(robustbase)\n", "%Rpush yy xx\n", "%R mod <- lmrob(yy ~ xx);\n", "%R params <- mod$coefficients;\n", "%Rpull params" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "%R print(mod)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(params)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "abline_plot(intercept=params[0], slope=params[1], ax=ax, color='green')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Exercise: Breakdown points of M-estimators" ] }, { "cell_type": "code", "collapsed": false, "input": [ "np.random.seed(12345)\n", "nobs = 200\n", "beta_true = np.array([3, 1, 2.5, 3, -4])\n", "X = np.random.uniform(-20,20, size=(nobs, len(beta_true)-1))\n", "# stack a constant in front\n", "X = sm.add_constant(X, prepend=True) # np.c_[np.ones(nobs), X]\n", "mc_iter = 500\n", "contaminate = .25 # fraction of the responses to contaminate" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "all_betas = []\n", "for i in range(mc_iter):\n", " y = np.dot(X, beta_true) + np.random.normal(size=200)\n", " random_idx = np.random.randint(0, nobs, size=int(contaminate * nobs))\n", " y[random_idx] = np.random.uniform(-750, 750)\n", " beta_hat = sm.RLM(y, X).fit().params\n", " all_betas.append(beta_hat)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "all_betas = np.asarray(all_betas)\n", "se_loss = lambda x : np.linalg.norm(x, ord=2)**2\n", "se_beta = lmap(se_loss, all_betas - beta_true)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "Squared error loss" ] }, { "cell_type": "code", "collapsed": false, "input": [ "np.array(se_beta).mean()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "all_betas.mean(0)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "beta_true" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "se_loss(all_betas.mean(0) - beta_true)" ], "language": "python", "metadata": {}, "outputs": [] },
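{ "cell_type": "markdown", "metadata": {}, "source": [ "For contrast, the same Monte Carlo can be run with ordinary least squares on identically contaminated data (a rough sketch reusing the setup above); its squared error loss is typically far larger than the RLM result." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# repeat the simulation with OLS instead of RLM to see the effect of the contamination\n", "all_betas_ols = []\n", "for i in range(mc_iter):\n", " y = np.dot(X, beta_true) + np.random.normal(size=200)\n", " random_idx = np.random.randint(0, nobs, size=int(contaminate * nobs))\n", " y[random_idx] = np.random.uniform(-750, 750)\n", " all_betas_ols.append(sm.OLS(y, X).fit().params)\n", "all_betas_ols = np.asarray(all_betas_ols)\n", "np.array(lmap(se_loss, all_betas_ols - beta_true)).mean()" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }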