{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Multiple Regression.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "dGNv8xvZv7hm",
"colab_type": "text"
},
"source": [
"Python version of example from here: https://stats.stackexchange.com/questions/17336/how-exactly-does-one-control-for-other-variables\n",
"\n",
"R version with step by step explanations: https://colab.research.google.com/drive/1sGCECEFcLCj4T-sUgbl_wPDn_TaREcna"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Gpi6-cefm1PU",
"colab_type": "code",
"outputId": "8390be3c-76aa-4d04-9725-7c5a1373a937",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 36
}
},
"source": [
"%pylab inline"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "KuJOl6VWneo1",
"colab_type": "code",
"colab": {}
},
"source": [
"covariate=np.random.randint(0,2,100)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dvmhCqY5nofX",
"colab_type": "code",
"colab": {}
},
"source": [
" exposure =random.uniform(0,1,(100,))+(0.3*covariate)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "pAjOLoh4ojt1",
"colab_type": "code",
"colab": {}
},
"source": [
"outcome = 2.0+(0.5*exposure)+(0.25*covariate)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "VsJ4rKu1oreN",
"colab_type": "code",
"outputId": "f59e175a-125b-4847-97f1-900525ada654",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 322
}
},
"source": [
"import pandas as pd\n",
"from statsmodels.formula.api import ols\n",
"\n",
"data=pd.DataFrame({'covariate':covariate,'exposure':exposure,'outcome':outcome})\n",
"results = ols('outcome ~ exposure',data=data).fit()\n",
"results.summary2()"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"
\n",
"\n",
" Model: | OLS | Adj. R-squared: | 0.742 | \n",
"
\n",
"\n",
" Dependent Variable: | outcome | AIC: | -141.5737 | \n",
"
\n",
"\n",
" Date: | 2019-05-02 20:12 | BIC: | -136.3634 | \n",
"
\n",
"\n",
" No. Observations: | 100 | Log-Likelihood: | 72.787 | \n",
"
\n",
"\n",
" Df Model: | 1 | F-statistic: | 285.7 | \n",
"
\n",
"\n",
" Df Residuals: | 98 | Prob (F-statistic): | 8.36e-31 | \n",
"
\n",
"\n",
" R-squared: | 0.745 | Scale: | 0.013934 | \n",
"
\n",
"
\n",
"\n",
"\n",
" | Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | \n",
"
\n",
"\n",
" Intercept | 2.0231 | 0.0281 | 72.0657 | 0.0000 | 1.9674 | 2.0788 | \n",
"
\n",
"\n",
" exposure | 0.6404 | 0.0379 | 16.9027 | 0.0000 | 0.5652 | 0.7156 | \n",
"
\n",
"
\n",
"\n",
"\n",
" Omnibus: | 521.416 | Durbin-Watson: | 1.763 | \n",
"
\n",
"\n",
" Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 10.336 | \n",
"
\n",
"\n",
" Skew: | 0.087 | Prob(JB): | 0.006 | \n",
"
\n",
"\n",
" Kurtosis: | 1.435 | Condition No.: | 5 | \n",
"
\n",
"
"
],
"text/plain": [
"\n",
"\"\"\"\n",
" Results: Ordinary least squares\n",
"==================================================================\n",
"Model: OLS Adj. R-squared: 0.742 \n",
"Dependent Variable: outcome AIC: -141.5737\n",
"Date: 2019-05-02 20:12 BIC: -136.3634\n",
"No. Observations: 100 Log-Likelihood: 72.787 \n",
"Df Model: 1 F-statistic: 285.7 \n",
"Df Residuals: 98 Prob (F-statistic): 8.36e-31 \n",
"R-squared: 0.745 Scale: 0.013934 \n",
"--------------------------------------------------------------------\n",
" Coef. Std.Err. t P>|t| [0.025 0.975]\n",
"--------------------------------------------------------------------\n",
"Intercept 2.0231 0.0281 72.0657 0.0000 1.9674 2.0788\n",
"exposure 0.6404 0.0379 16.9027 0.0000 0.5652 0.7156\n",
"------------------------------------------------------------------\n",
"Omnibus: 521.416 Durbin-Watson: 1.763 \n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 10.336\n",
"Skew: 0.087 Prob(JB): 0.006 \n",
"Kurtosis: 1.435 Condition No.: 5 \n",
"==================================================================\n",
"\n",
"\"\"\""
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "8I9P6zRhpaJx",
"colab_type": "code",
"outputId": "5a3cf579-e66f-4047-c5a4-67797ba965ee",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 344
}
},
"source": [
"results = ols('outcome ~ exposure+covariate',data=data).fit()\n",
"results.summary2()"
],
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
" Model: | OLS | Adj. R-squared: | 1.000 | \n",
"
\n",
"\n",
" Dependent Variable: | outcome | AIC: | -6470.3043 | \n",
"
\n",
"\n",
" Date: | 2019-05-02 20:12 | BIC: | -6462.4888 | \n",
"
\n",
"\n",
" No. Observations: | 100 | Log-Likelihood: | 3238.2 | \n",
"
\n",
"\n",
" Df Model: | 2 | F-statistic: | 5.923e+29 | \n",
"
\n",
"\n",
" Df Residuals: | 97 | Prob (F-statistic): | 0.00 | \n",
"
\n",
"\n",
" R-squared: | 1.000 | Scale: | 4.5136e-30 | \n",
"
\n",
"
\n",
"\n",
"\n",
" | Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | \n",
"
\n",
"\n",
" Intercept | 2.0000 | 0.0000 | 3944752018701820.5000 | 0.0000 | 2.0000 | 2.0000 | \n",
"
\n",
"\n",
" exposure | 0.5000 | 0.0000 | 686731232963876.1250 | 0.0000 | 0.5000 | 0.5000 | \n",
"
\n",
"\n",
" covariate | 0.2500 | 0.0000 | 550037902076897.2500 | 0.0000 | 0.2500 | 0.2500 | \n",
"
\n",
"
\n",
"\n",
"\n",
" Omnibus: | 1.500 | Durbin-Watson: | 0.060 | \n",
"
\n",
"\n",
" Prob(Omnibus): | 0.472 | Jarque-Bera (JB): | 1.321 | \n",
"
\n",
"\n",
" Skew: | 0.126 | Prob(JB): | 0.517 | \n",
"
\n",
"\n",
" Kurtosis: | 2.497 | Condition No.: | 5 | \n",
"
\n",
"
"
],
"text/plain": [
"\n",
"\"\"\"\n",
" Results: Ordinary least squares\n",
"====================================================================\n",
"Model: OLS Adj. R-squared: 1.000 \n",
"Dependent Variable: outcome AIC: -6470.3043\n",
"Date: 2019-05-02 20:12 BIC: -6462.4888\n",
"No. Observations: 100 Log-Likelihood: 3238.2 \n",
"Df Model: 2 F-statistic: 5.923e+29 \n",
"Df Residuals: 97 Prob (F-statistic): 0.00 \n",
"R-squared: 1.000 Scale: 4.5136e-30\n",
"--------------------------------------------------------------------\n",
" Coef. Std.Err. t P>|t| [0.025 0.975]\n",
"--------------------------------------------------------------------\n",
"Intercept 2.0000 0.0000 3944752018701820.5000 0.0000 2.0000 2.0000\n",
"exposure 0.5000 0.0000 686731232963876.1250 0.0000 0.5000 0.5000\n",
"covariate 0.2500 0.0000 550037902076897.2500 0.0000 0.2500 0.2500\n",
"--------------------------------------------------------------------\n",
"Omnibus: 1.500 Durbin-Watson: 0.060\n",
"Prob(Omnibus): 0.472 Jarque-Bera (JB): 1.321\n",
"Skew: 0.126 Prob(JB): 0.517\n",
"Kurtosis: 2.497 Condition No.: 5 \n",
"====================================================================\n",
"\n",
"\"\"\""
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "3zysnMpmr8Ca",
"colab_type": "code",
"outputId": "39954792-3dc9-42ac-9644-545202a20d35",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 272
}
},
"source": [
"import graphviz\n",
"from graphviz import Digraph\n",
"from ipywidgets import Image\n",
"\n",
"dot = Digraph()\n",
"dot.node('c', 'covariate')\n",
"dot.node('e', 'exposure')\n",
"dot.node('o', 'outcome')\n",
"dot.edges(['ce','co','eo'])\n",
"\n",
"#print(dot.source)\n",
"graphviz.Source(dot)"
],
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
""
],
"image/svg+xml": "\n\n\n\n\n"
},
"metadata": {
"tags": []
},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "hDazH4CTs21x",
"colab_type": "code",
"outputId": "4c90b67e-791f-4ac2-ae1c-29f8d92df5d3",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 176
}
},
"source": [
"# different case. try to simulate, and control (this is a simpler case than the one above)\n",
"\n",
"dot = Digraph()\n",
"dot.node('g', 'gender')\n",
"dot.node('h', 'height')\n",
"dot.node('t', 'tv time')\n",
"dot.edges(['gh','gt'])\n",
"graphviz.Source(dot)"
],
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
""
],
"image/svg+xml": "\n\n\n\n\n"
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "V9O_NUOMuGdX",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 0,
"outputs": []
}
]
}