{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Regression Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import statsmodels.formula.api as sm\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from scipy import stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Get the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.text.Text at 0x846ccd0>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_str = '''Region Alcohol Tobacco\n",
    "North 6.47 4.03\n",
    "Yorkshire 6.13 3.76\n",
    "Northeast 6.19 3.77\n",
    "East_Midlands 4.89 3.34\n",
    "West_Midlands 5.63 3.47\n",
    "East_Anglia 4.52 2.92\n",
    "Southeast 5.89 3.20\n",
    "Southwest 4.79 2.71\n",
    "Wales 5.27 3.53\n",
    "Scotland 6.08 4.51\n",
    "Northern_Ireland 4.02 4.56'''\n",
    "\n",
    "# Read in the data. Note that for Python 2.x, you have to change the \"import\" statement\n",
    "from io import StringIO\n",
    "df = pd.read_csv(StringIO(data_str), sep=r'\\s+')\n",
    "\n",
    "# Plot the data\n",
    "df.plot('Tobacco', 'Alcohol', style='o')\n",
    "plt.ylabel('Alcohol')\n",
    "plt.title('Sales in Several UK Regions')\n",
    "#plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Show Regression Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                Alcohol   R-squared:                       0.615\n",
      "Model:                            OLS   Adj. R-squared:                  0.567\n",
      "Method:                 Least Squares   F-statistic:                     12.78\n",
      "Date:                Sun, 01 Nov 2015   Prob (F-statistic):            0.00723\n",
      "Time:                        18:30:14   Log-Likelihood:                -4.9998\n",
      "No. Observations:                  10   AIC:                             14.00\n",
      "Df Residuals:                       8   BIC:                             14.60\n",
      "Df Model:                           1                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [95.0% Conf. Int.]\n",
      "------------------------------------------------------------------------------\n",
      "Intercept      2.0412      1.001      2.038      0.076        -0.268     4.350\n",
      "Tobacco        1.0059      0.281      3.576      0.007         0.357     1.655\n",
      "==============================================================================\n",
      "Omnibus:                        2.542   Durbin-Watson:                   1.975\n",
      "Prob(Omnibus):                  0.281   Jarque-Bera (JB):                0.904\n",
      "Skew:                          -0.014   Prob(JB):                        0.636\n",
      "Kurtosis:                       1.527   Cond. No.                         27.2\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\WinPython-32bit-3.4.3.6\\python-3.4.3\\lib\\site-packages\\scipy\\stats\\stats.py:1285: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10\n",
      "  \"anyway, n=%i\" % int(n))\n"
     ]
    }
   ],
   "source": [
    "result = sm.ols('Alcohol ~ Tobacco', df[:-1]).fit()\n",
    "print(result.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Model Parameters"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### F-Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F-statistic: 12.785,  p-value: 0.00723\n"
     ]
    }
   ],
   "source": [
    "N = result.nobs\n",
    "k = result.df_model+1\n",
    "dfm, dfe = k-1, N - k\n",
    "F = result.mse_model / result.mse_resid\n",
    "p = 1.0 - stats.f.cdf(F,dfm,dfe)\n",
    "print('F-statistic: {:.3f},  p-value: {:.5f}'.format( F, p ))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Likelihood"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ln(L) = -4.99975869739\n"
     ]
    }
   ],
   "source": [
    "N = result.nobs\n",
    "SSR = result.ssr\n",
    "s2 = SSR / N\n",
    "L = ( 1.0/np.sqrt(2*np.pi*s2) ) ** N * np.exp( -SSR/(s2*2.0) )\n",
    "print('ln(L) =', np.log( L ))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "## Coefficients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Intercept    2.041223\n",
      "Tobacco      1.005896\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "print(result.params)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Standard Error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "# Standard Errors\n",
    "df['Eins'] = np.ones(( len(df), ))\n",
    "Y = df.Alcohol[:-1]\n",
    "X = df[['Tobacco','Eins']][:-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 1.00136021         nan]\n",
      " [        nan  0.28132158]]\n"
     ]
    }
   ],
   "source": [
    "X = df.Tobacco[:-1]\n",
    "\n",
    "# add a column of ones for the constant intercept term\n",
    "X = np.vstack(( np.ones(X.size), X ))\n",
    "\n",
    "# convert the NumPy arrray to matrix\n",
    "X = np.matrix( X )\n",
    "\n",
    "# perform the matrix multiplication,\n",
    "# and then take the inverse\n",
    "C = np.linalg.inv( X * X.T )\n",
    "\n",
    "# multiply by the MSE of the residual\n",
    "C *= result.mse_resid\n",
    "\n",
    "# take the square root\n",
    "SE = np.sqrt(C)\n",
    "\n",
    "print(SE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### T-statistic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "t = 3.57560845424\n"
     ]
    }
   ],
   "source": [
    "i = 1\n",
    "beta = result.params[i]\n",
    "se = SE[i,i]\n",
    "t = beta / se\n",
    "print('t =', t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "p = 0.007\n"
     ]
    }
   ],
   "source": [
    "N = result.nobs\n",
    "k = result.df_model + 1\n",
    "dof = N - k\n",
    "p_onesided = 1.0 - stats.t( dof ).cdf( t )\n",
    "p = p_onesided * 2.0\n",
    "print('p = {0:.3f}'.format(p))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Confidence Intervals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-0.267917709371 4.35036388305\n"
     ]
    }
   ],
   "source": [
    "i = 0\n",
    "\n",
    "# the estimated coefficient, and its variance\n",
    "beta, c = result.params[i], SE[i,i]\n",
    "\n",
    "# critical value of the t-statistic\n",
    "N = result.nobs\n",
    "P = result.df_model\n",
    "dof = N - P - 1\n",
    "z = stats.t( dof ).ppf(0.975)\n",
    "\n",
    "# the confidence interval\n",
    "print(beta - z * c, beta + z * c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Analysis of Residuals"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Skew and Kurtosis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skewness: -0.014,  Kurtosis: 1.527\n"
     ]
    }
   ],
   "source": [
    "d = Y - result.fittedvalues\n",
    "S = np.mean( d**3.0 ) / np.mean( d**2.0 )**(3.0/2.0)\n",
    "K = np.mean( d**4.0 ) / np.mean( d**2.0 )**(4.0/2.0)\n",
    "print('Skewness: {:.3f},  Kurtosis: {:.3f}'.format( S, K ))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Omnibus Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Omnibus: 2.5418981690649516\n",
      "Pr( Omnibus ) = 0.2805652152710648\n",
      "Omnibus: 2.5418981690649542, p = 0.28056521527106454\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\WinPython-32bit-3.4.3.6\\python-3.4.3\\lib\\site-packages\\scipy\\stats\\stats.py:1285: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10\n",
      "  \"anyway, n=%i\" % int(n))\n"
     ]
    }
   ],
   "source": [
    "def Z1( s, n ):\n",
    "    Y = s * np.sqrt( ( ( n + 1 )*( n + 3 ) ) / ( 6.0 * ( n - 2.0 ) ) )\n",
    "    b = 3.0 * ( n**2.0 + 27.0*n - 70 )*( n + 1.0 )*( n + 3.0 )\n",
    "    b /= ( n - 2.0 )*( n + 5.0 )*( n + 7.0 )*( n + 9.0 )\n",
    "    W2 = - 1.0 + np.sqrt( 2.0 * ( b - 1.0 ) )\n",
    "    alpha = np.sqrt( 2.0 / ( W2 - 1.0 ) )\n",
    "    z = 1.0 / np.sqrt( np.log( np.sqrt( W2 ) ) )\n",
    "    z *= np.log( Y / alpha + np.sqrt( ( Y / alpha )**2.0 + 1.0 ) )\n",
    "    return z\n",
    "\n",
    "def Z2( k, n ):\n",
    "    E = 3.0 * ( n - 1.0 ) / ( n + 1.0 )\n",
    "    v = 24.0 * n * ( n - 2.0 )*( n - 3.0 )\n",
    "    v /= ( n + 1.0 )**2.0*( n + 3.0 )*( n + 5.0 )\n",
    "    X = ( k - E ) / np.sqrt( v )\n",
    "    b = ( 6.0 * ( n**2.0 - 5.0*n + 2.0 ) ) / ( ( n + 7.0 )*( n + 9.0 ) )\n",
    "    b *= np.sqrt( ( 6.0 * ( n + 3.0 )*( n + 5.0 ) ) / ( n * ( n - 2.0 )*( n - 3.0 ) ) )\n",
    "    A = 6.0 + ( 8.0 / b )*( 2.0 / b + np.sqrt( 1.0 + 4.0 / b**2.0 ) )\n",
    "    z = ( 1.0 - 2.0 / A ) / ( 1.0 + X * np.sqrt( 2.0 / ( A - 4.0 ) ) )\n",
    "    z = ( 1.0 - 2.0 / ( 9.0 * A ) ) - z**(1.0/3.0)\n",
    "    z /= np.sqrt( 2.0 / ( 9.0 * A ) )\n",
    "    return z\n",
    "\n",
    "K2 = Z1( S, N )**2.0 + Z2( K, N )**2.0\n",
    "print('Omnibus: {}'.format( K2))\n",
    "\n",
    "p = 1.0 - stats.chi2(2).cdf( K2 )\n",
    "print('Pr( Omnibus ) = {}'.format( p ))\n",
    "\n",
    "(K2, p) = stats.normaltest(result.resid)\n",
    "print('Omnibus: {0}, p = {1}'.format(K2, p))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Durbin Watson"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Durbin-Watson: 1.97535\n"
     ]
    }
   ],
   "source": [
    "DW = np.sum( np.diff( result.resid.values )**2.0 ) / result.ssr\n",
    "print('Durbin-Watson: {:.5f}'.format( DW ))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Jarque-Bera Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JB-statistic: 0.90421,  p-value: 0.63629\n"
     ]
    }
   ],
   "source": [
    "JB = (N/6.0) * ( S**2.0 + (1.0/4.0)*( K - 3.0 )**2.0 )\n",
    "p = 1.0 - stats.chi2(2).cdf(JB)\n",
    "print('JB-statistic: {:.5f},  p-value: {:.5f}'.format( JB, p ))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Condition Number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([ 136.51527115,    0.18412885]), matrix([[ 0.96332746, -0.26832855],\n",
      "        [ 0.26832855,  0.96332746]]))\n"
     ]
    }
   ],
   "source": [
    "X = df.Tobacco[:-1]\n",
    " \n",
    "# add a column of ones for the constant intercept term\n",
    "X = np.vstack(( X, np.ones( X.size ) ))\n",
    " \n",
    "# convert the NumPy arrray to matrix\n",
    "X = np.matrix( X )\n",
    "EV = np.linalg.eig( X * X.T )\n",
    "print(EV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Condition No.: 27.22887\n"
     ]
    }
   ],
   "source": [
    "CN = np.sqrt( EV[0].max() / EV[0].min() )\n",
    "print('Condition No.: {:.5f}'.format( CN ))"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}