{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Multiple Regression.ipynb", "version": "0.3.2", "provenance": [], "collapsed_sections": [], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "dGNv8xvZv7hm", "colab_type": "text" }, "source": [ "Python version of example from here: https://stats.stackexchange.com/questions/17336/how-exactly-does-one-control-for-other-variables\n", "\n", "R version with step by step explanations: https://colab.research.google.com/drive/1sGCECEFcLCj4T-sUgbl_wPDn_TaREcna" ] }, { "cell_type": "code", "metadata": { "id": "Gpi6-cefm1PU", "colab_type": "code", "outputId": "8390be3c-76aa-4d04-9725-7c5a1373a937", "colab": { "base_uri": "https://localhost:8080/", "height": 36 } }, "source": [ "%pylab inline" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "KuJOl6VWneo1", "colab_type": "code", "colab": {} }, "source": [ "covariate=np.random.randint(0,2,100)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "dvmhCqY5nofX", "colab_type": "code", "colab": {} }, "source": [ " exposure =random.uniform(0,1,(100,))+(0.3*covariate)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "pAjOLoh4ojt1", "colab_type": "code", "colab": {} }, "source": [ "outcome = 2.0+(0.5*exposure)+(0.25*covariate)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "VsJ4rKu1oreN", "colab_type": "code", "outputId": "f59e175a-125b-4847-97f1-900525ada654", "colab": { "base_uri": "https://localhost:8080/", "height": 322 } }, "source": [ "import pandas as pd\n", "from statsmodels.formula.api import ols\n", "\n", "data=pd.DataFrame({'covariate':covariate,'exposure':exposure,'outcome':outcome})\n", "results = ols('outcome ~ exposure',data=data).fit()\n", "results.summary2()" ], "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Model: OLS Adj. R-squared: 0.742
Dependent Variable: outcome AIC: -141.5737
Date: 2019-05-02 20:12 BIC: -136.3634
No. Observations: 100 Log-Likelihood: 72.787
Df Model: 1 F-statistic: 285.7
Df Residuals: 98 Prob (F-statistic): 8.36e-31
R-squared: 0.745 Scale: 0.013934
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 2.0231 0.0281 72.0657 0.0000 1.9674 2.0788
exposure 0.6404 0.0379 16.9027 0.0000 0.5652 0.7156
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Omnibus: 521.416 Durbin-Watson: 1.763
Prob(Omnibus): 0.000 Jarque-Bera (JB): 10.336
Skew: 0.087 Prob(JB): 0.006
Kurtosis: 1.435 Condition No.: 5
" ], "text/plain": [ "\n", "\"\"\"\n", " Results: Ordinary least squares\n", "==================================================================\n", "Model: OLS Adj. R-squared: 0.742 \n", "Dependent Variable: outcome AIC: -141.5737\n", "Date: 2019-05-02 20:12 BIC: -136.3634\n", "No. Observations: 100 Log-Likelihood: 72.787 \n", "Df Model: 1 F-statistic: 285.7 \n", "Df Residuals: 98 Prob (F-statistic): 8.36e-31 \n", "R-squared: 0.745 Scale: 0.013934 \n", "--------------------------------------------------------------------\n", " Coef. Std.Err. t P>|t| [0.025 0.975]\n", "--------------------------------------------------------------------\n", "Intercept 2.0231 0.0281 72.0657 0.0000 1.9674 2.0788\n", "exposure 0.6404 0.0379 16.9027 0.0000 0.5652 0.7156\n", "------------------------------------------------------------------\n", "Omnibus: 521.416 Durbin-Watson: 1.763 \n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 10.336\n", "Skew: 0.087 Prob(JB): 0.006 \n", "Kurtosis: 1.435 Condition No.: 5 \n", "==================================================================\n", "\n", "\"\"\"" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "id": "8I9P6zRhpaJx", "colab_type": "code", "outputId": "5a3cf579-e66f-4047-c5a4-67797ba965ee", "colab": { "base_uri": "https://localhost:8080/", "height": 344 } }, "source": [ "results = ols('outcome ~ exposure+covariate',data=data).fit()\n", "results.summary2()" ], "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Model: OLS Adj. R-squared: 1.000
Dependent Variable: outcome AIC: -6470.3043
Date: 2019-05-02 20:12 BIC: -6462.4888
No. Observations: 100 Log-Likelihood: 3238.2
Df Model: 2 F-statistic: 5.923e+29
Df Residuals: 97 Prob (F-statistic): 0.00
R-squared: 1.000 Scale: 4.5136e-30
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 2.0000 0.0000 3944752018701820.5000 0.0000 2.0000 2.0000
exposure 0.5000 0.0000 686731232963876.1250 0.0000 0.5000 0.5000
covariate 0.2500 0.0000 550037902076897.2500 0.0000 0.2500 0.2500
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Omnibus: 1.500 Durbin-Watson: 0.060
Prob(Omnibus): 0.472 Jarque-Bera (JB): 1.321
Skew: 0.126 Prob(JB): 0.517
Kurtosis: 2.497 Condition No.: 5
" ], "text/plain": [ "\n", "\"\"\"\n", " Results: Ordinary least squares\n", "====================================================================\n", "Model: OLS Adj. R-squared: 1.000 \n", "Dependent Variable: outcome AIC: -6470.3043\n", "Date: 2019-05-02 20:12 BIC: -6462.4888\n", "No. Observations: 100 Log-Likelihood: 3238.2 \n", "Df Model: 2 F-statistic: 5.923e+29 \n", "Df Residuals: 97 Prob (F-statistic): 0.00 \n", "R-squared: 1.000 Scale: 4.5136e-30\n", "--------------------------------------------------------------------\n", " Coef. Std.Err. t P>|t| [0.025 0.975]\n", "--------------------------------------------------------------------\n", "Intercept 2.0000 0.0000 3944752018701820.5000 0.0000 2.0000 2.0000\n", "exposure 0.5000 0.0000 686731232963876.1250 0.0000 0.5000 0.5000\n", "covariate 0.2500 0.0000 550037902076897.2500 0.0000 0.2500 0.2500\n", "--------------------------------------------------------------------\n", "Omnibus: 1.500 Durbin-Watson: 0.060\n", "Prob(Omnibus): 0.472 Jarque-Bera (JB): 1.321\n", "Skew: 0.126 Prob(JB): 0.517\n", "Kurtosis: 2.497 Condition No.: 5 \n", "====================================================================\n", "\n", "\"\"\"" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] }, { "cell_type": "code", "metadata": { "id": "3zysnMpmr8Ca", "colab_type": "code", "outputId": "39954792-3dc9-42ac-9644-545202a20d35", "colab": { "base_uri": "https://localhost:8080/", "height": 272 } }, "source": [ "import graphviz\n", "from graphviz import Digraph\n", "from ipywidgets import Image\n", "\n", "dot = Digraph()\n", "dot.node('c', 'covariate')\n", "dot.node('e', 'exposure')\n", "dot.node('o', 'outcome')\n", "dot.edges(['ce','co','eo'])\n", "\n", "#print(dot.source)\n", "graphviz.Source(dot)" ], "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ], "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\nc\n\ncovariate\n\n\n\ne\n\nexposure\n\n\n\nc->e\n\n\n\n\n\no\n\noutcome\n\n\n\nc->o\n\n\n\n\n\ne->o\n\n\n\n\n\n" }, "metadata": { "tags": [] }, "execution_count": 7 } ] }, { "cell_type": "code", "metadata": { "id": "hDazH4CTs21x", "colab_type": "code", "outputId": "4c90b67e-791f-4ac2-ae1c-29f8d92df5d3", "colab": { "base_uri": "https://localhost:8080/", "height": 176 } }, "source": [ "# different case. try to simulate, and control (this is a simpler case than the one above)\n", "\n", "dot = Digraph()\n", "dot.node('g', 'gender')\n", "dot.node('h', 'height')\n", "dot.node('t', 'tv time')\n", "dot.edges(['gh','gt'])\n", "graphviz.Source(dot)" ], "execution_count": 8, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ], "image/svg+xml": "\n\n\n\n\n\n%3\n\n\n\ng\n\ngender\n\n\n\nh\n\nheight\n\n\n\ng->h\n\n\n\n\n\nt\n\ntv time\n\n\n\ng->t\n\n\n\n\n\n" }, "metadata": { "tags": [] }, "execution_count": 8 } ] }, { "cell_type": "code", "metadata": { "id": "V9O_NUOMuGdX", "colab_type": "code", "colab": {} }, "source": [ "" ], "execution_count": 0, "outputs": [] } ] }