{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "source": [ "%pylab inline\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Logistic Regression = Binomial regression with logit function\n", "\n", "This notebook shows (empirically) that performing a logistic regression for\n", "binary data is equivalent to a binomial regression with logit link." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
xy
01.6911
11.6911
21.6911
31.6911
41.6911
.........
4761.8841
4771.8841
4781.8841
4791.8841
4801.8841
\n", "

481 rows × 2 columns

\n", "
" ], "text/plain": [ " x y\n", "0 1.691 1\n", "1 1.691 1\n", "2 1.691 1\n", "3 1.691 1\n", "4 1.691 1\n", ".. ... ..\n", "476 1.884 1\n", "477 1.884 1\n", "478 1.884 1\n", "479 1.884 1\n", "480 1.884 1\n", "\n", "[481 rows x 2 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ungrouped_data = pd.read_csv(\"../data/Beetles.dat\", sep=\"\\t\")\n", "ungrouped_data" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
logdosendeadalive
01.69159653
11.724601347
21.755621844
31.784562828
41.811635211
51.83759536
61.86162611
71.88460600
\n", "
" ], "text/plain": [ " logdose n dead alive\n", "0 1.691 59 6 53\n", "1 1.724 60 13 47\n", "2 1.755 62 18 44\n", "3 1.784 56 28 28\n", "4 1.811 63 52 11\n", "5 1.837 59 53 6\n", "6 1.861 62 61 1\n", "7 1.884 60 60 0" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grouped_data = pd.read_csv(\"../data/Beetles2.dat\", sep= \"\\t\")\n", "grouped_data['alive'] = grouped_data['n'] - grouped_data['dead']\n", "grouped_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Logistic" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 0.387063\n", " Iterations 7\n" ] } ], "source": [ "import statsmodels.api as sm\n", "#spector_data.exog = sm.add_constant(spector_data.exog)\n", "logit_mod = sm.Logit(ungrouped_data['y'], sm.add_constant(ungrouped_data['x']))\n", "logit_res = logit_mod.fit()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Logit Regression Results \n", "==============================================================================\n", "Dep. Variable: y No. Observations: 481\n", "Model: Logit Df Residuals: 479\n", "Method: MLE Df Model: 1\n", "Date: Fri, 07 Feb 2020 Pseudo R-squ.: 0.4231\n", "Time: 16:09:36 Log-Likelihood: -186.18\n", "converged: True LL-Null: -322.72\n", "Covariance Type: nonrobust LLR p-value: 2.411e-61\n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -60.7401 5.182 -11.722 0.000 -70.896 -50.584\n", "x 34.2859 2.913 11.769 0.000 28.576 39.996\n", "==============================================================================\n" ] } ], "source": [ "print(logit_res.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Binomial" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Generalized Linear Model Regression Results \n", "==============================================================================\n", "Dep. Variable: y No. Observations: 8\n", "Model: GLM Df Residuals: 6\n", "Model Family: Binomial Df Model: 1\n", "Link Function: logit Scale: 1.0000\n", "Method: IRLS Log-Likelihood: -2.1699\n", "Date: Fri, 07 Feb 2020 Deviance: 0.18673\n", "Time: 16:09:58 Pearson chi2: 0.166\n", "No. Iterations: 5 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -60.4819 40.062 -1.510 0.131 -139.002 18.039\n", "logdose 34.1370 22.522 1.516 0.130 -10.005 78.279\n", "==============================================================================\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Calling Family(..) with a link class as argument is deprecated.\n", "Use an instance of a link class instead.\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "glm_model = sm.GLM(grouped_data['dead']/grouped_data['n'], sm.add_constant(grouped_data['logdose']), family=sm.families.Binomial(link=sm.families.links.logit),\n", " weights = grouped_data['n'])\n", "glm_fit = glm_model.fit()\n", "print(glm_fit.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The above is an abuse of statsmodels. I don't even know why it works." ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Generalized Linear Model Regression Results \n", "==============================================================================\n", "Dep. Variable: ['dead', 'alive'] No. Observations: 8\n", "Model: GLM Df Residuals: 6\n", "Model Family: Binomial Df Model: 1\n", "Link Function: logit Scale: 1.0000\n", "Method: IRLS Log-Likelihood: -18.657\n", "Date: Fri, 07 Feb 2020 Deviance: 11.116\n", "Time: 17:25:29 Pearson chi2: 9.91\n", "No. Iterations: 6 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -60.7401 5.182 -11.722 0.000 -70.896 -50.584\n", "logdose 34.2859 2.913 11.769 0.000 28.576 39.996\n", "==============================================================================\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Calling Family(..) with a link class as argument is deprecated.\n", "Use an instance of a link class instead.\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "glm_model = sm.GLM(grouped_data[['dead', 'alive']] , sm.add_constant(grouped_data['logdose']), family=sm.families.Binomial(link=sm.families.links.logit),\n", " )\n", "glm_fit = glm_model.fit()\n", "print(glm_fit.summary())" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Generalized Linear Model Regression Results \n", "==============================================================================\n", "Dep. Variable: y No. Observations: 481\n", "Model: GLM Df Residuals: 479\n", "Model Family: Binomial Df Model: 1\n", "Link Function: logit Scale: 1.0000\n", "Method: IRLS Log-Likelihood: -186.18\n", "Date: Fri, 07 Feb 2020 Deviance: 372.35\n", "Time: 12:45:10 Pearson chi2: 436.\n", "No. Iterations: 6 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -60.7401 5.182 -11.722 0.000 -70.896 -50.584\n", "x 34.2859 2.913 11.769 0.000 28.576 39.996\n", "==============================================================================\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Calling Family(..) with a link class as argument is deprecated.\n", "Use an instance of a link class instead.\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "glm_model = sm.GLM(grouped_data['y'], sm.add_constant(grouped_data['x']), family=sm.families.Binomial(link=sm.families.links.logit))\n", "glm_fit = glm_model.fit()\n", "print(glm_fit.summary())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python (riboraptor)", "language": "python", "name": "riboraptor" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }