{
 "cells": [
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "# Introduction to Modeling Libraries"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "import numpy as np\n",
      "import pandas as pd\n",
      "np.random.seed(12345)\n",
      "import matplotlib.pyplot as plt\n",
      "plt.rc('figure', figsize=(10, 6))\n",
      "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
      "pd.options.display.max_rows = 20\n",
      "np.set_printoptions(precision=4, suppress=True)"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "## Interfacing Between pandas and Model Code"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "import pandas as pd\n",
      "import numpy as np\n",
      "data = pd.DataFrame({\n",
      "    'x0': [1, 2, 3, 4, 5],\n",
      "    'x1': [0.01, -0.01, 0.25, -4.1, 0.],\n",
      "    'y': [-1.5, 0., 3.6, 1.3, -2.]})\n",
      "data\n",
      "data.columns\n",
      "data.values"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "df2 = pd.DataFrame(data.values, columns=['one', 'two', 'three'])\n",
      "df2"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "model_cols = ['x0', 'x1']\n",
      "data.loc[:, model_cols].values"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "data['category'] = pd.Categorical(['a', 'b', 'a', 'a', 'b'],\n",
      "                                  categories=['a', 'b'])\n",
      "data"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "dummies = pd.get_dummies(data.category, prefix='category')\n",
      "data_with_dummies = data.drop('category', axis=1).join(dummies)\n",
      "data_with_dummies"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "## Creating Model Descriptions with Patsy"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "y ~ x0 + x1"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "data = pd.DataFrame({\n",
      "    'x0': [1, 2, 3, 4, 5],\n",
      "    'x1': [0.01, -0.01, 0.25, -4.1, 0.],\n",
      "    'y': [-1.5, 0., 3.6, 1.3, -2.]})\n",
      "data\n",
      "import patsy\n",
      "y, X = patsy.dmatrices('y ~ x0 + x1', data)"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "y\n",
      "X"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "np.asarray(y)\n",
      "np.asarray(X)"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "patsy.dmatrices('y ~ x0 + x1 + 0', data)[1]"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "# rcond=None selects NumPy's current default cutoff and avoids the\n",
      "# FutureWarning raised when rcond is left unspecified.\n",
      "coef, resid, _, _ = np.linalg.lstsq(X, y, rcond=None)"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "coef\n",
      "coef = pd.Series(coef.squeeze(), index=X.design_info.column_names)\n",
      "coef"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "### Data Transformations in Patsy Formulas"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)\n",
      "X"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)\n",
      "X"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "new_data = pd.DataFrame({\n",
      "    'x0': [6, 7, 8, 9],\n",
      "    'x1': [3.1, -0.5, 0, 2.3],\n",
      "    'y': [1, 2, 3, 4]})\n",
      "new_X = patsy.build_design_matrices([X.design_info], new_data)\n",
      "new_X"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "y, X = patsy.dmatrices('y ~ I(x0 + x1)', data)\n",
      "X"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "### Categorical Data and Patsy"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "data = pd.DataFrame({\n",
      "    'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],\n",
      "    'key2': [0, 1, 0, 1, 0, 1, 0, 0],\n",
      "    'v1': [1, 2, 3, 4, 5, 6, 7, 8],\n",
      "    'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7]\n",
      "})\n",
      "y, X = patsy.dmatrices('v2 ~ key1', data)\n",
      "X"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "y, X = patsy.dmatrices('v2 ~ key1 + 0', data)\n",
      "X"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "y, X = patsy.dmatrices('v2 ~ C(key2)', data)\n",
      "X"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "data['key2'] = data['key2'].map({0: 'zero', 1: 'one'})\n",
      "data\n",
      "y, X = patsy.dmatrices('v2 ~ key1 + key2', data)\n",
      "X\n",
      "y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)\n",
      "X"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "## Introduction to statsmodels"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "### Estimating Linear Models"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "import statsmodels.api as sm\n",
      "import statsmodels.formula.api as smf"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "def dnorm(mean, variance, size=1):\n",
      "    \"\"\"Draw normally distributed samples with the given mean and variance.\n",
      "\n",
      "    `size` is either a single integer (Python or NumPy) giving the number\n",
      "    of samples, or a sequence of dimensions that is unpacked into\n",
      "    np.random.randn.\n",
      "    \"\"\"\n",
      "    if isinstance(size, (int, np.integer)):\n",
      "        size = (size,)\n",
      "    return mean + np.sqrt(variance) * np.random.randn(*size)\n",
      "\n",
      "# For reproducibility\n",
      "np.random.seed(12345)\n",
      "\n",
      "N = 100\n",
      "X = np.c_[dnorm(0, 0.4, size=N),\n",
      "          dnorm(0, 0.6, size=N),\n",
      "          dnorm(0, 0.2, size=N)]\n",
      "eps = dnorm(0, 0.1, size=N)\n",
      "beta = [0.1, 0.3, 0.5]\n",
      "\n",
      "y = np.dot(X, beta) + eps"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "X[:5]\n",
      "y[:5]"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "X_model = sm.add_constant(X)\n",
      "X_model[:5]"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "model = sm.OLS(y, X)"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "results = model.fit()\n",
      "results.params"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "print(results.summary())"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])\n",
      "data['y'] = y\n",
      "data[:5]"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()\n",
      "results.params\n",
      "results.tvalues"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "results.predict(data[:5])"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "### Estimating Time Series Processes"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "init_x = 4\n",
      "\n",
      "import random\n",
      "values = [init_x, init_x]\n",
      "N = 1000\n",
      "\n",
      "b0 = 0.8\n",
      "b1 = -0.4\n",
      "noise = dnorm(0, 0.1, N)\n",
      "for i in range(N):\n",
      "    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]\n",
      "    values.append(new_x)"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "MAXLAGS = 5\n",
      "try:\n",
      "    # sm.tsa.AR was deprecated in statsmodels 0.11 and later removed;\n",
      "    # AutoReg is its replacement.\n",
      "    from statsmodels.tsa.ar_model import AutoReg\n",
      "except ImportError:\n",
      "    # Older statsmodels releases only provide the AR class\n",
      "    model = sm.tsa.AR(values)\n",
      "    results = model.fit(MAXLAGS)\n",
      "else:\n",
      "    model = AutoReg(values, lags=MAXLAGS)\n",
      "    results = model.fit()"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "results.params"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "## Introduction to scikit-learn"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "train = pd.read_csv('datasets/titanic/train.csv')\n",
      "test = pd.read_csv('datasets/titanic/test.csv')\n",
      "train[:4]"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "train.isnull().sum()\n",
      "test.isnull().sum()"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "impute_value = train['Age'].median()\n",
      "train['Age'] = train['Age'].fillna(impute_value)\n",
      "test['Age'] = test['Age'].fillna(impute_value)"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "train['IsFemale'] = (train['Sex'] == 'female').astype(int)\n",
      "test['IsFemale'] = (test['Sex'] == 'female').astype(int)"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "predictors = ['Pclass', 'IsFemale', 'Age']\n",
      "X_train = train[predictors].values\n",
      "X_test = test[predictors].values\n",
      "y_train = train['Survived'].values\n",
      "X_train[:5]\n",
      "y_train[:5]"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "from sklearn.linear_model import LogisticRegression\n",
      "model = LogisticRegression()"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "model.fit(X_train, y_train)"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "y_predict = model.predict(X_test)\n",
      "y_predict[:10]"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "`y_true` is not defined in this notebook; given the true values for the test dataset, the fraction of correct predictions would be:\n",
      "\n",
      "```python\n",
      "(y_true == y_predict).mean()\n",
      "```"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "from sklearn.linear_model import LogisticRegressionCV\n",
      "# Cs must be passed by keyword: recent scikit-learn releases make the\n",
      "# constructor arguments keyword-only.\n",
      "model_cv = LogisticRegressionCV(Cs=10)\n",
      "model_cv.fit(X_train, y_train)"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "from sklearn.model_selection import cross_val_score\n",
      "model = LogisticRegression(C=10)\n",
      "scores = cross_val_score(model, X_train, y_train, cv=4)\n",
      "scores"
    ] },
  { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true },
    "source": [
      "## Continuing Your Education"
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [],
    "source": [
      "pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
    ] }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
  "language_info": {
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}