{ "metadata": { "name": "", "signature": "sha256:3d56dcbfbf79a5d9735e5240f044566b3157072f90f085a06c84b6ee3e94ae70" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Prediction (out of sample)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from __future__ import print_function\n", "import numpy as np\n", "import statsmodels.api as sm" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Artificial data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "nsample = 50\n", "sig = 0.25\n", "x1 = np.linspace(0, 20, nsample)\n", "X = np.column_stack((x1, np.sin(x1), (x1-5)**2))\n", "X = sm.add_constant(X)\n", "beta = [5., 0.5, 0.5, -0.02]\n", "y_true = np.dot(X, beta)\n", "y = y_true + sig * np.random.normal(size=nsample)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Estimation " ] }, { "cell_type": "code", "collapsed": false, "input": [ "olsmod = sm.OLS(y, X)\n", "olsres = olsmod.fit()\n", "print(olsres.summary())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## In-sample prediction" ] }, { "cell_type": "code", "collapsed": false, "input": [ "ypred = olsres.predict(X)\n", "print(ypred)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a new sample of explanatory variables Xnew, predict and plot" ] }, { "cell_type": "code", "collapsed": false, "input": [ "x1n = np.linspace(20.5,25, 10)\n", "Xnew = np.column_stack((x1n, np.sin(x1n), (x1n-5)**2))\n", "Xnew = sm.add_constant(Xnew)\n", "ynewpred = olsres.predict(Xnew) # predict out of sample\n", "print(ynewpred)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot comparison" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import matplotlib.pyplot as plt\n", "\n", "fig, ax = plt.subplots()\n", "ax.plot(x1, y, 'o', label=\"Data\")\n", "ax.plot(x1, y_true, 'b-', label=\"True\")\n", "ax.plot(np.hstack((x1, x1n)), np.hstack((ypred, ynewpred)), 'r', label=\"OLS prediction\")\n", "ax.legend(loc=\"best\");" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Predicting with Formulas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using formulas can make both estimation and prediction a lot easier" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from statsmodels.formula.api import ols\n", "\n", "data = {\"x1\" : x1, \"y\" : y}\n", "\n", "res = ols(\"y ~ x1 + np.sin(x1) + I((x1-5)**2)\", data=data).fit()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We use the `I` to indicate use of the Identity transform. Ie., we don't want any expansion magic from using `**2`" ] }, { "cell_type": "code", "collapsed": false, "input": [ "res.params" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we only have to pass the single variable and we get the transformed right-hand side variables automatically" ] }, { "cell_type": "code", "collapsed": false, "input": [ "res.predict(exog=dict(x1=x1n))" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }