{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Modeling in Python" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", " from pandas.core import datetools\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import statsmodels.api as sm\n", "import sklearn as sl" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
x0x1y
010.01-1.5
12-0.010.0
230.253.6
34-4.101.3
450.00-2.0
\n", "
" ], "text/plain": [ " x0 x1 y\n", "0 1 0.01 -1.5\n", "1 2 -0.01 0.0\n", "2 3 0.25 3.6\n", "3 4 -4.10 1.3\n", "4 5 0.00 -2.0" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Toy dataset: two predictor columns (x0, x1) and one response column (y).\n", "data = dict(\n", "    x0=[1, 2, 3, 4, 5],\n", "    x1=[0.01, -0.01, 0.25, -4.1, 0.0],\n", "    y=[-1.5, 0.0, 3.6, 1.3, -2.0],\n", ")\n", "df = pd.DataFrame(data)\n", "df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 1. , 0.01, -1.5 ],\n", " [ 2. , -0.01, 0. ],\n", " [ 3. , 0.25, 3.6 ],\n", " [ 4. , -4.1 , 1.3 ],\n", " [ 5. , 0. , -2. ]])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The frame's contents as a plain NumPy array.\n", "df.values" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['x0', 'x1', 'y'], dtype='object')" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The frame's column labels.\n", "df.columns" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothree
010.01-1.5
12-0.010.0
230.253.6
34-4.101.3
450.00-2.0
\n", "
" ], "text/plain": [ " one two three\n", "0 1 0.01 -1.5\n", "1 2 -0.01 0.0\n", "2 3 0.25 3.6\n", "3 4 -4.10 1.3\n", "4 5 0.00 -2.0" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rename on a copy so `df` keeps the x0/x1/y labels that the patsy formula below relies on.\n", "df_renamed = df.copy()\n", "df_renamed.columns = [\"one\", \"two\", \"three\"]\n", "df_renamed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## patsy" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "import patsy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `patsy.dmatrices` function takes a formula string along with a dataset and produces design matrices for a linear model:" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "y, X = patsy.dmatrices(\"y ~ x0 + x1\", data = df)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DesignMatrix with shape (5, 1)\n", " y\n", " -1.5\n", " 0.0\n", " 3.6\n", " 1.3\n", " -2.0\n", " Terms:\n", " 'y' (column 0)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DesignMatrix with shape (5, 3)\n", " Intercept x0 x1\n", " 1 1 0.01\n", " 1 2 -0.01\n", " 1 3 0.25\n", " 1 4 -4.10\n", " 1 5 0.00\n", " Terms:\n", " 'Intercept' (column 0)\n", " 'x0' (column 1)\n", " 'x1' (column 2)" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## statsmodels" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# X already carries patsy's Intercept column, so no constant is added here.\n", "model = sm.OLS(y, X)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# Estimate the coefficients; the result is summarised in the next cell.\n", "fit = model.fit()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS 
Regression Results \n", "==============================================================================\n", "Dep. Variable: y R-squared: 0.042\n", "Model: OLS Adj. R-squared: -0.915\n", "Method: Least Squares F-statistic: 0.04431\n", "Date: Mon, 30 Oct 2017 Prob (F-statistic): 0.958\n", "Time: 03:13:21 Log-Likelihood: -10.515\n", "No. Observations: 5 AIC: 27.03\n", "Df Residuals: 2 BIC: 25.86\n", "Df Model: 2 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "Intercept 0.3129 3.313 0.094 0.933 -13.940 14.566\n", "x0 -0.0791 1.057 -0.075 0.947 -4.628 4.470\n", "x1 -0.2655 0.896 -0.296 0.795 -4.122 3.592\n", "==============================================================================\n", "Omnibus: nan Durbin-Watson: 1.653\n", "Prob(Omnibus): nan Jarque-Bera (JB): 0.702\n", "Skew: 0.875 Prob(JB): 0.704\n", "Kurtosis: 2.447 Cond. No. 8.84\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/anaconda3/lib/python3.6/site-packages/statsmodels/stats/stattools.py:72: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.\n", " \"samples were given.\" % int(n), ValueWarning)\n" ] } ], "source": [ "print(fit.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## scikit-learn" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train = pd.read_csv(\"datasets/titanic/train.csv\")\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "# Build the model inputs on a derived frame so `train` stays exactly as loaded and displayed above.\n", "impute_value = train['Age'].median()\n", "train_prepared = train.assign(\n", "    Age=train['Age'].fillna(impute_value),  # fill missing ages with the median age\n", "    IsFemale=(train['Sex'] == 'female').astype(int),  # 1 for female, 0 otherwise\n", ")\n", "predictors = ['Pclass', 'IsFemale', 'Age']\n", "X_train = train_prepared[predictors].values\n", "y_train = train_prepared['Survived'].values" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# Named separately from the statsmodels `model` above, which is a different kind of object.\n", "logit_model = LogisticRegression()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "logit_fit = logit_model.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False)\n" ] } ], "source": [ "# Printing the fitted object shows the estimator and its hyperparameters.\n", "print(logit_fit)" ] } ], "metadata": { "kernelspec": 
{ "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }