{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Simple linear models\n", "\n", "- \"model_formulas\" is based on examples in Kaplan \"Statistical Modeling\".\n", "- \"polynomial_regression\" shows how to work with simple design matrices, like MATLAB's \"regress\" command.\n", "\n", "Author: Thomas Haslwanter, Date: Feb-2017" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "%matplotlib inline\n", "import numpy as np\n", "import pandas as pd\n", "from urllib.request import urlopen\n", "from statsmodels.formula.api import ols\n", "import statsmodels.regression.linear_model as sm\n", "from statsmodels.stats.anova import anova_lm" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Define models through formulas" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "# Get the data\n", "inFile = 'swim100m.csv'\n", "url_base = 'https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_kaplan/'\n", "url = url_base + inFile\n", "data = pd.read_csv(urlopen(url))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### OLS Model" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: time R-squared: 0.287\n", "Model: OLS Adj. R-squared: 0.275\n", "Method: Least Squares F-statistic: 24.13\n", "Date: Sat, 04 Feb 2017 Prob (F-statistic): 7.28e-06\n", "Time: 14:21:20 Log-Likelihood: -219.23\n", "No. Observations: 62 AIC: 442.5\n", "Df Residuals: 60 BIC: 446.7\n", "Df Model: 1 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "Intercept 65.1923 1.517 42.986 0.000 62.159 68.226\n", "sex[T.M] -10.5361 2.145 -4.912 0.000 -14.826 -6.246\n", "==============================================================================\n", "Omnibus: 16.370 Durbin-Watson: 0.353\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 19.838\n", "Skew: 1.113 Prob(JB): 4.92e-05\n", "Kurtosis: 4.649 Cond. No. 2.62\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" ] } ], "source": [ "# Different models\n", "model1 = ols(\"time ~ sex\", data).fit() # one factor\n", "print(model1.summary())" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### ANOVA" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " df sum_sq mean_sq F PR(>F)\n", "sex 1.0 1720.655232 1720.655232 24.132575 0.000007\n", "Residual 60.0 4278.006477 71.300108 NaN NaN\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater\n", " return (self.a < x) & (x < self.b)\n", "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less\n", " return (self.a < x) & (x < self.b)\n", "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal\n", " cond2 = cond0 & (x <= self.a)\n" ] } ], "source": [ "print(anova_lm(model1))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: time R-squared: 0.844\n", "Model: OLS Adj. R-squared: 0.839\n", "Method: Least Squares F-statistic: 159.6\n", "Date: Sat, 04 Feb 2017 Prob (F-statistic): 1.58e-24\n", "Time: 14:21:20 Log-Likelihood: -172.12\n", "No. Observations: 62 AIC: 350.2\n", "Df Residuals: 59 BIC: 356.6\n", "Df Model: 2 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "Intercept 555.7168 33.800 16.441 0.000 488.083 623.350\n", "sex[T.M] -9.7980 1.013 -9.673 0.000 -11.825 -7.771\n", "year -0.2515 0.017 -14.516 0.000 -0.286 -0.217\n", "==============================================================================\n", "Omnibus: 52.546 Durbin-Watson: 0.375\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 241.626\n", "Skew: 2.430 Prob(JB): 3.40e-53\n", "Kurtosis: 11.362 Cond. No. 1.30e+05\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 1.3e+05. This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n" ] } ], "source": [ "model2 = ols(\"time ~ sex + year\", data).fit() # two factors\n", "print(model2.summary())\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " df sum_sq mean_sq F PR(>F)\n", "sex 1.0 1720.655232 1720.655232 108.479881 5.475511e-15\n", "year 1.0 3342.177104 3342.177104 210.709831 3.935386e-21\n", "Residual 59.0 935.829374 15.861515 NaN NaN\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater\n", " return (self.a < x) & (x < self.b)\n", "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less\n", " return (self.a < x) & (x < self.b)\n", "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal\n", " cond2 = cond0 & (x <= self.a)\n" ] } ], "source": [ "print(anova_lm(model2))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: time R-squared: 0.893\n", "Model: OLS Adj. R-squared: 0.888\n", "Method: Least Squares F-statistic: 162.1\n", "Date: Sat, 04 Feb 2017 Prob (F-statistic): 3.67e-28\n", "Time: 14:21:21 Log-Likelihood: -160.30\n", "No. Observations: 62 AIC: 328.6\n", "Df Residuals: 58 BIC: 337.1\n", "Df Model: 3 \n", "Covariance Type: nonrobust \n", "=================================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "---------------------------------------------------------------------------------\n", "Intercept 697.3012 39.221 17.779 0.000 618.791 775.811\n", "sex[T.M] -302.4638 56.412 -5.362 0.000 -415.384 -189.544\n", "year -0.3240 0.020 -16.118 0.000 -0.364 -0.284\n", "sex[T.M]:year 0.1499 0.029 5.189 0.000 0.092 0.208\n", "==============================================================================\n", "Omnibus: 49.919 Durbin-Watson: 0.575\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 243.008\n", "Skew: 2.224 Prob(JB): 1.70e-53\n", "Kurtosis: 11.619 Cond. No. 3.40e+05\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 3.4e+05. This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n" ] } ], "source": [ "model3 = ols(\"time ~ sex * year\", data).fit() # two factors with interaction\n", "print(model3.summary())" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " df sum_sq mean_sq F PR(>F)\n", "sex 1.0 1720.655232 1720.655232 156.140793 4.299569e-18\n", "year 1.0 3342.177104 3342.177104 303.285733 1.039245e-24\n", "sex:year 1.0 296.675432 296.675432 26.921801 2.826421e-06\n", "Residual 58.0 639.153942 11.019896 NaN NaN\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater\n", " return (self.a < x) & (x < self.b)\n", "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less\n", " return (self.a < x) & (x < self.b)\n", "C:\\Programs\\WinPython-64bit-3.6.0.1Qt5\\python-3.6.0.amd64\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal\n", " cond2 = cond0 & (x <= self.a)\n" ] } ], "source": [ "print(anova_lm(model3))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Polynomial Regression\n", "\n", "Here we define the model directly through the design matrix. Similar to MATLAB's \"regress\" command." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Summary:\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: y R-squared: 0.993\n", "Model: OLS Adj. R-squared: 0.993\n", "Method: Least Squares F-statistic: 7046.\n", "Date: Sat, 04 Feb 2017 Prob (F-statistic): 9.76e-106\n", "Time: 14:21:21 Log-Likelihood: -313.30\n", "No. Observations: 100 AIC: 632.6\n", "Df Residuals: 97 BIC: 640.4\n", "Df Model: 2 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 3.7463 1.658 2.260 0.026 0.456 7.036\n", "x1 3.1027 0.774 4.009 0.000 1.567 4.639\n", "x2 1.9708 0.076 26.055 0.000 1.821 2.121\n", "==============================================================================\n", "Omnibus: 1.436 Durbin-Watson: 1.845\n", "Prob(Omnibus): 0.488 Jarque-Bera (JB): 1.189\n", "Skew: -0.043 Prob(JB): 0.552\n", "Kurtosis: 2.473 Cond. No. 142.\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" ] } ], "source": [ "# Generate the data\n", "t = np.arange(0,10,0.1)\n", "y = 4 + 3*t + 2*t**2 + 5*np.random.randn(len(t))\n", "\n", "# Make the fit. Note that this is another \"OLS\" than the one in \"model_formulas\"!\n", "M = np.column_stack((np.ones(len(t)), t, t**2))\n", "res = sm.OLS(y, M).fit()\n", " \n", "# Display the results\n", "print('Summary:')\n", "print(res.summary())" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The fit parameters are: [ 3.74631449 3.10268496 1.97082745]\n", "The confidence intervals are:\n", "[[ 0.45628815 7.03634082]\n", " [ 1.56675671 4.6386132 ]\n", " [ 1.82070297 2.12095193]]\n" ] } ], "source": [ "print('The fit parameters are: {0}'.format(str(res.params)))\n", "print('The confidence intervals are:')\n", "print(res.conf_int())" ] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 0 }