{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "logistic_regression1.ipynb", "provenance": [], "collapsed_sections": [], "authorship_tag": "ABX9TyNol4fUXTtv3ctDtoUC90S4", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "# using sklearn.datasets.fetch_openml\n", "https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_openml.html" ], "metadata": { "id": "kEifD1ODpzuN" } }, { "cell_type": "code", "source": [ "from sklearn.datasets import fetch_openml\n", "import pandas as pd\n", "import numpy as np\n", "import statsmodels.formula.api as smf\n", "import statsmodels.api as sm" ], "metadata": { "id": "6XjTAYh-3v-T" }, "execution_count": 70, "outputs": [] }, { "cell_type": "code", "source": [ "X, y = fetch_openml(\"titanic\", version=1, as_frame=True, return_X_y=True)\n", "type(X)" ], "metadata": { "id": "3ki8YUTB3wA3", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "da0cf4fa-ca0e-4a7c-b8af-8a043d21a390" }, "execution_count": 40, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "metadata": {}, "execution_count": 40 } ] }, { "cell_type": "code", "source": [ "X.info()" ], "metadata": { "id": "fwkV4pfN6vvW", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "44dc72e3-6312-4ab9-c632-f2bdf5d954b6" }, "execution_count": 41, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 1309 entries, 0 to 1308\n", "Data columns (total 13 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 pclass 1309 non-null float64 \n", " 1 name 1309 non-null object \n", " 2 sex 1309 non-null category\n", " 3 age 1046 non-null float64 \n", " 4 sibsp 
1309 non-null float64 \n", " 5 parch 1309 non-null float64 \n", " 6 ticket 1309 non-null object \n", " 7 fare 1308 non-null float64 \n", " 8 cabin 295 non-null object \n", " 9 embarked 1307 non-null category\n", " 10 boat 486 non-null object \n", " 11 body 121 non-null float64 \n", " 12 home.dest 745 non-null object \n", "dtypes: category(2), float64(6), object(5)\n", "memory usage: 115.4+ KB\n" ] } ] }, { "cell_type": "markdown", "source": [ "#### https://pandas.pydata.org/docs/reference/series.html" ], "metadata": { "id": "2aGAXejfmKPW" } }, { "cell_type": "code", "source": [ "y.dtype" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Mldn3eA_jB-0", "outputId": "375cac45-9be3-41cc-ee49-4395cc209930" }, "execution_count": 42, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "CategoricalDtype(categories=['0', '1'], ordered=False)" ] }, "metadata": {}, "execution_count": 42 } ] }, { "cell_type": "code", "source": [ "y.attrs" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QVEFjcw8jsxl", "outputId": "84a379d9-e6fe-4b4e-9048-ed42020acffb" }, "execution_count": 43, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{}" ] }, "metadata": {}, "execution_count": 43 } ] }, { "cell_type": "code", "source": [ "y.values" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wIqqASVLj0cz", "outputId": "23896929-44bf-47aa-90ce-cf43c1419245" }, "execution_count": 44, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['1', '1', '0', '0', '0', ..., '0', '0', '0', '0', '0']\n", "Length: 1309\n", "Categories (2, object): ['0', '1']" ] }, "metadata": {}, "execution_count": 44 } ] }, { "cell_type": "code", "source": [ "y" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mYwwvdHyj0a0", "outputId": "6f75b9af-3e20-47cf-b99f-b891e147afd1" }, "execution_count": 45, "outputs": [ { "output_type": "execute_result", 
"data": { "text/plain": [ "0 1\n", "1 1\n", "2 0\n", "3 0\n", "4 0\n", " ..\n", "1304 0\n", "1305 0\n", "1306 0\n", "1307 0\n", "1308 0\n", "Name: survived, Length: 1309, dtype: category\n", "Categories (2, object): ['0', '1']" ] }, "metadata": {}, "execution_count": 45 } ] }, { "cell_type": "code", "source": [ "y.index" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UllJQGC-j0Xg", "outputId": "0c0b4a99-f6d4-421e-a745-6f8ebb12484b" }, "execution_count": 46, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "RangeIndex(start=0, stop=1309, step=1)" ] }, "metadata": {}, "execution_count": 46 } ] }, { "cell_type": "code", "source": [ "y.empty" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_TeH1a3Wj0Uq", "outputId": "8774fdac-a6bd-4c36-cdb3-5b6fd70b76cf" }, "execution_count": 47, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "False" ] }, "metadata": {}, "execution_count": 47 } ] }, { "cell_type": "code", "source": [ "X.columns" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ZsOlTkY8j0SH", "outputId": "77972782-7fa9-486a-bf60-33fa0cf758e2" }, "execution_count": 48, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare',\n", " 'cabin', 'embarked', 'boat', 'body', 'home.dest'],\n", " dtype='object')" ] }, "metadata": {}, "execution_count": 48 } ] }, { "cell_type": "code", "source": [ "y.name" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "gjto5wXAlaz0", "outputId": "bf0605ef-f51b-4f43-d16f-023fcc0e2f1f" }, "execution_count": 49, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'survived'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 49 } ] }, { "cell_type": "code", "source": [ "y.flags" ], "metadata": { 
"colab": { "base_uri": "https://localhost:8080/" }, "id": "8WpS-3jPlwUk", "outputId": "5e95475d-295d-4744-d873-368e96aa29e1" }, "execution_count": 50, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 50 } ] }, { "cell_type": "code", "source": [ "y.astype" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "x5eARr8slwOT", "outputId": "5e4c3048-2829-4f7e-a2c5-2adc5ea0f307" }, "execution_count": 51, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 51 } ] }, { "cell_type": "code", "source": [ "y.rank" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KEoW4vo-lwK2", "outputId": "6f59b752-80eb-44eb-86e5-c6a8728ac31a" }, "execution_count": 52, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 52 } ] }, { "cell_type": "code", "source": [ "# A Series has no `.type` attribute (accessing it raises AttributeError);\n", "# the element type is exposed as `.dtype`.\n", "y.dtype" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 287 }, "id": "3yA0q1WPlwIL", "outputId": "1e8dc6e0-4623-4fa4-a7f2-0e76a7ef5296" }, "execution_count": null, "outputs": [] }, 
{ "cell_type": "code", "source": [ "y.astype" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ypponXlIlwFf", "outputId": "543afd00-0855-4a20-b8c3-5886ed6ad1c9" }, "execution_count": 54, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 54 } ] }, { "cell_type": "code", "source": [ "type(y)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DYtJdVy-jNMN", "outputId": "42f32465-e6a5-422e-f8a8-84ac828a69a9" }, "execution_count": 55, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "pandas.core.series.Series" ] }, "metadata": {}, "execution_count": 55 } ] }, { "cell_type": "code", "source": [ "df = pd.merge(X, y, right_index = True, left_index = True)\n", "print(df)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lAKIOOl_kbSw", "outputId": "3e7323a8-df0d-4fd6-b37f-fd81de085e55" }, "execution_count": 56, 
"outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " pclass name sex \\\n", "0 1.0 Allen, Miss. Elisabeth Walton female \n", "1 1.0 Allison, Master. Hudson Trevor male \n", "2 1.0 Allison, Miss. Helen Loraine female \n", "3 1.0 Allison, Mr. Hudson Joshua Creighton male \n", "4 1.0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female \n", "... ... ... ... \n", "1304 3.0 Zabour, Miss. Hileni female \n", "1305 3.0 Zabour, Miss. Thamine female \n", "1306 3.0 Zakarian, Mr. Mapriededer male \n", "1307 3.0 Zakarian, Mr. Ortin male \n", "1308 3.0 Zimmerman, Mr. Leo male \n", "\n", " age sibsp parch ticket fare cabin embarked boat body \\\n", "0 29.0000 0.0 0.0 24160 211.3375 B5 S 2 NaN \n", "1 0.9167 1.0 2.0 113781 151.5500 C22 C26 S 11 NaN \n", "2 2.0000 1.0 2.0 113781 151.5500 C22 C26 S None NaN \n", "3 30.0000 1.0 2.0 113781 151.5500 C22 C26 S None 135.0 \n", "4 25.0000 1.0 2.0 113781 151.5500 C22 C26 S None NaN \n", "... ... ... ... ... ... ... ... ... ... \n", "1304 14.5000 1.0 0.0 2665 14.4542 None C None 328.0 \n", "1305 NaN 1.0 0.0 2665 14.4542 None C None NaN \n", "1306 26.5000 0.0 0.0 2656 7.2250 None C None 304.0 \n", "1307 27.0000 0.0 0.0 2670 7.2250 None C None NaN \n", "1308 29.0000 0.0 0.0 315082 7.8750 None S None NaN \n", "\n", " home.dest survived \n", "0 St Louis, MO 1 \n", "1 Montreal, PQ / Chesterville, ON 1 \n", "2 Montreal, PQ / Chesterville, ON 0 \n", "3 Montreal, PQ / Chesterville, ON 0 \n", "4 Montreal, PQ / Chesterville, ON 0 \n", "... ... ... 
\n", "1304 None 0 \n", "1305 None 0 \n", "1306 None 0 \n", "1307 None 0 \n", "1308 None 0 \n", "\n", "[1309 rows x 14 columns]\n" ] } ] }, { "cell_type": "code", "source": [ "df.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "o7r2T528kbQD", "outputId": "b8a73bbf-9a1d-4dbe-f711-c787428391af" }, "execution_count": 57, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 1309 entries, 0 to 1308\n", "Data columns (total 14 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 pclass 1309 non-null float64 \n", " 1 name 1309 non-null object \n", " 2 sex 1309 non-null category\n", " 3 age 1046 non-null float64 \n", " 4 sibsp 1309 non-null float64 \n", " 5 parch 1309 non-null float64 \n", " 6 ticket 1309 non-null object \n", " 7 fare 1308 non-null float64 \n", " 8 cabin 295 non-null object \n", " 9 embarked 1307 non-null category\n", " 10 boat 486 non-null object \n", " 11 body 121 non-null float64 \n", " 12 home.dest 745 non-null object \n", " 13 survived 1309 non-null category\n", "dtypes: category(3), float64(6), object(5)\n", "memory usage: 116.8+ KB\n" ] } ] }, { "cell_type": "code", "source": [ "df.index" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "v9GJ5gPDkbNP", "outputId": "d416db50-1c0b-4e6b-b88e-9bc00068e4fe" }, "execution_count": 58, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "RangeIndex(start=0, stop=1309, step=1)" ] }, "metadata": {}, "execution_count": 58 } ] }, { "cell_type": "markdown", "metadata": { "id": "2ss6meclqIKn" }, "source": [ "$$ logit(Survival) = \\alpha + \\beta_{1}*PClass + \\beta_{2}*Age + \\epsilon $$\n" ] }, { "cell_type": "markdown", "metadata": { "id": "Pn1hX1aJqMmo" }, "source": [ "Here we specify the model with a formula and fit it with the statsmodels GLM functionality (Binomial family, logit link)." ] }, { "cell_type": "code", "metadata": { "id": "AWdix2ayqQGG", "colab": { "base_uri": "https://localhost:8080/" }, 
"outputId": "29bd9dd7-0720-484d-94eb-8e59273afc19" }, "source": [ "# `survived` is a categorical with string levels '0' and '1'. Used as-is, the formula\n", "# expands it into two columns (survived[0], survived[1]) and the Binomial family reads\n", "# them as [successes, failures], so the fit would describe the odds of NOT surviving.\n", "# Cast to int so that survived == 1 is the modelled event.\n", "model_df = df.assign(survived=df['survived'].astype(int))\n", "\n", "formula = 'survived ~ age + pclass'\n", "model = smf.glm(formula=formula, data=model_df, family=sm.families.Binomial())\n", "result = model.fit()\n", "print(result.summary())" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Interpretation\n", "\n", "The coefficients are on the log-odds scale for survival (`survived == 1`): about -0.037 for `age` and about -1.10 for `pclass`. Exponentiating a coefficient gives the multiplicative change in the odds of survival.\n", "\n", "- **Age** - holding passenger class fixed, for each one-year increase in age the odds of survival on the Titanic are multiplied by exp(-0.037), about 0.96, i.e. they decrease by about 1 - 0.96 = 0.04, or 4%.\n", "- **Pclass** - holding age fixed, for each one-step drop in passenger class (e.g. 1st to 2nd class) the odds of survival are multiplied by exp(-1.10), about 0.33, i.e. they decrease by about 1 - 0.33 = 0.67, or 67%." ], "metadata": { "id": "_IKaZQJknrzY" } } ] }