{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Modeling in Python"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n",
" from pandas.core import datetools\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import statsmodels.api as sm\n",
"import sklearn as sl"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" x0 | \n",
" x1 | \n",
" y | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0.01 | \n",
" -1.5 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" -0.01 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 0.25 | \n",
" 3.6 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" -4.10 | \n",
" 1.3 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 0.00 | \n",
" -2.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" x0 x1 y\n",
"0 1 0.01 -1.5\n",
"1 2 -0.01 0.0\n",
"2 3 0.25 3.6\n",
"3 4 -4.10 1.3\n",
"4 5 0.00 -2.0"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Toy dataset: two predictor columns (x0, x1) and one response column (y).\n",
"data = {\n",
"    \"x0\": [1, 2, 3, 4, 5],\n",
"    \"x1\": [0.01, -0.01, 0.25, -4.1, 0.0],\n",
"    \"y\": [-1.5, 0.0, 3.6, 1.3, -2.0],\n",
"}\n",
"df = pd.DataFrame(data=data)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1. , 0.01, -1.5 ],\n",
" [ 2. , -0.01, 0. ],\n",
" [ 3. , 0.25, 3.6 ],\n",
" [ 4. , -4.1 , 1.3 ],\n",
" [ 5. , 0. , -2. ]])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.values"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['x0', 'x1', 'y'], dtype='object')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0.01 | \n",
" -1.5 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" -0.01 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 0.25 | \n",
" 3.6 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" -4.10 | \n",
" 1.3 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 0.00 | \n",
" -2.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three\n",
"0 1 0.01 -1.5\n",
"1 2 -0.01 0.0\n",
"2 3 0.25 3.6\n",
"3 4 -4.10 1.3\n",
"4 5 0.00 -2.0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rename the columns on a copy so that `df` keeps x0/x1/y; the patsy formula\n",
"# in the next section looks those names up in `df` and would fail after an\n",
"# in-place rename when the notebook is run top to bottom.\n",
"df_renamed = df.copy()\n",
"df_renamed.columns = [\"one\", \"two\", \"three\"]\n",
"df_renamed"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## patsy"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import patsy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `patsy.dmatrices` function takes a formula string along with a dataset and produces design matrices for a linear model:"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Left-hand side of the formula becomes `y`, right-hand side becomes `X`.\n",
"y, X = patsy.dmatrices(\n",
"    \"y ~ x0 + x1\",\n",
"    data=df,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DesignMatrix with shape (5, 1)\n",
" y\n",
" -1.5\n",
" 0.0\n",
" 3.6\n",
" 1.3\n",
" -2.0\n",
" Terms:\n",
" 'y' (column 0)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DesignMatrix with shape (5, 3)\n",
" Intercept x0 x1\n",
" 1 1 0.01\n",
" 1 2 -0.01\n",
" 1 3 0.25\n",
" 1 4 -4.10\n",
" 1 5 0.00\n",
" Terms:\n",
" 'Intercept' (column 0)\n",
" 'x0' (column 1)\n",
" 'x1' (column 2)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## statsmodels"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"model = sm.OLS(y, X)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"fit = model.fit()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y R-squared: 0.042\n",
"Model: OLS Adj. R-squared: -0.915\n",
"Method: Least Squares F-statistic: 0.04431\n",
"Date: Mon, 30 Oct 2017 Prob (F-statistic): 0.958\n",
"Time: 03:13:21 Log-Likelihood: -10.515\n",
"No. Observations: 5 AIC: 27.03\n",
"Df Residuals: 2 BIC: 25.86\n",
"Df Model: 2 \n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"Intercept 0.3129 3.313 0.094 0.933 -13.940 14.566\n",
"x0 -0.0791 1.057 -0.075 0.947 -4.628 4.470\n",
"x1 -0.2655 0.896 -0.296 0.795 -4.122 3.592\n",
"==============================================================================\n",
"Omnibus: nan Durbin-Watson: 1.653\n",
"Prob(Omnibus): nan Jarque-Bera (JB): 0.702\n",
"Skew: 0.875 Prob(JB): 0.704\n",
"Kurtosis: 2.447 Cond. No. 8.84\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/anaconda3/lib/python3.6/site-packages/statsmodels/stats/stattools.py:72: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.\n",
" \"samples were given.\" % int(n), ValueWarning)\n"
]
}
],
"source": [
"print(fit.summary())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## scikit-learn"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Survived | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 3 | \n",
" Braund, Mr. Owen Harris | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" A/5 21171 | \n",
" 7.2500 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" PC 17599 | \n",
" 71.2833 | \n",
" C85 | \n",
" C | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
" 3 | \n",
" Heikkinen, Miss. Laina | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" STON/O2. 3101282 | \n",
" 7.9250 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
" female | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 113803 | \n",
" 53.1000 | \n",
" C123 | \n",
" S | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 0 | \n",
" 3 | \n",
" Allen, Mr. William Henry | \n",
" male | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 373450 | \n",
" 8.0500 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S "
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train = pd.read_csv(\"datasets/titanic/train.csv\")\n",
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Fill missing ages with the median age, then encode sex as a 0/1 indicator.\n",
"impute_value = train['Age'].median()\n",
"train['Age'] = train['Age'].fillna(impute_value)\n",
"train['IsFemale'] = train['Sex'].eq('female').astype(int)\n",
"\n",
"# Feature matrix and target vector as NumPy arrays for the classifier below.\n",
"predictors = ['Pclass', 'IsFemale', 'Age']\n",
"X_train = train.loc[:, predictors].values\n",
"y_train = train.loc[:, 'Survived'].values"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"model = LogisticRegression()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"fit = model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
" penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
" verbose=0, warm_start=False)\n"
]
}
],
"source": [
"print(fit)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}