{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "

Regression: Table of Contents

\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook contains an example of a regression problem, using the Boston House Price dataset." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:10:01.126902Z", "start_time": "2021-01-28T06:09:55.903195Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.datasets import load_boston\n", "np.set_printoptions(suppress=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:10:03.338368Z", "start_time": "2021-01-28T06:10:03.281968Z" } }, "outputs": [], "source": [ "data = load_boston(return_X_y=False)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:10:12.476775Z", "start_time": "2021-01-28T06:10:12.469213Z" } }, "outputs": [], "source": [ "X = data['data']\n", "y = data['target']\n", "feature_names = data['feature_names']\n", "DSC = data['DESCR']" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:10:21.184246Z", "start_time": "2021-01-28T06:10:21.180218Z" } }, "outputs": [], "source": [ "#print(DSC)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:10:28.891868Z", "start_time": "2021-01-28T06:10:28.885885Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of features: 13\n", "Number of examples: 506\n" ] } ], "source": [ "print('Number of features:',X.shape[1])\n", "print('Number of examples:',X.shape[0])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:10:40.132042Z", "start_time": "2021-01-28T06:10:40.121717Z" } }, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "CRIM\t0.00632\n", "ZN\t18.0\n", "INDUS\t2.31\n", "CHAS\t0.0\n", "NOX\t0.538\n", "RM\t6.575\n", "AGE\t65.2\n", "DIS\t4.09\n", "RAD\t1.0\n", "TAX\t296.0\n", "PTRATIO\t15.3\n", "B\t396.9\n", "LSTAT\t4.98\n", "--------\n", "target : 24.0\n" ] } ], "source": [ "n=0\n", "for name,value in zip(feature_names,X[n]):\n", "    print(name,value,sep='\\t')\n", "print('--------')\n", "print('target : ',y[n])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# A simpler problem\n", "## Only two features" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:11:03.664744Z", "start_time": "2021-01-28T06:11:03.657821Z" } }, "outputs": [], "source": [ "# Keep only RM (column 5) and LSTAT (column 12).\n", "# Index the original matrix instead of X itself so this cell can be re-run safely:\n", "# slicing X in place raises IndexError once X has only two columns.\n", "X = data['data'][:,[5,12]]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:11:05.338142Z", "start_time": "2021-01-28T06:11:05.326169Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " x1 \t x2 \t| y\n", "______________________________\n", "6.575 \t 4.98 \t| 24.0\n", "6.421 \t 9.14 \t| 21.6\n", "7.185 \t 4.03 \t| 34.7\n", "6.998 \t 2.94 \t| 33.4\n", "7.147 \t 5.33 \t| 36.2\n", "6.43 \t 5.21 \t| 28.7\n", "6.012 \t 12.43 \t| 22.9\n", "6.172 \t 19.15 \t| 27.1\n", "5.631 \t 29.93 \t| 16.5\n", "6.004 \t 17.1 \t| 18.9\n" ] } ], "source": [ "print(' x1 \\t x2 \\t| y')\n", "print('_'*30)\n", "for xi,yi in zip(X[:10], y[:10]):\n", "    print(xi[0],'\\t',xi[1],'\\t|',yi)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:11:10.362751Z", "start_time": "2021-01-28T06:11:10.066960Z" } }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.plot(X[:,0],y,'.',alpha=0.5)\n", "plt.xlabel('input: x')\n", "plt.ylabel('output: y')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#y = x1w1 + x2w2 + .." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using Least Squares\n", "\n", "$$y' = Xw$$\n", "$.$\n", "$$\\min_w ||y-Xw||_2^2$$\n", "$.$\n", "$$w = (X^{T}X)^{-1}X^{T}y$$" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:12:28.395668Z", "start_time": "2021-01-28T06:12:28.386460Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(506, 2) (506,)\n" ] } ], "source": [ "X = data['data'][:,[5,12]]\n", "y = data['target']\n", "print(X.shape, y.shape)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:12:57.960382Z", "start_time": "2021-01-28T06:12:57.950149Z" } }, "outputs": [ { "data": { "text/plain": [ "((506, 3), (506,))" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X1 = np.c_[np.ones(len(X)), X]\n", "X1.shape, y.shape" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:13:00.522286Z", "start_time": "2021-01-28T06:13:00.514925Z" } }, "outputs": [ { "data": { "text/plain": [ "array([[1. , 6.575, 4.98 ],\n", " [1. , 6.421, 9.14 ],\n", " [1. , 7.185, 4.03 ],\n", " ...,\n", " [1. , 6.976, 5.64 ],\n", " [1. , 6.794, 6.48 ],\n", " [1. 
, 6.03 , 7.88 ]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fitting (training)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:13:18.599468Z", "start_time": "2021-01-28T06:13:18.591457Z" } }, "outputs": [ { "data": { "text/plain": [ "array([-1.35827281, 5.09478798, -0.64235833])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Normal equations: w = (X^T X)^{-1} X^T y.\n", "# This needs the matrix inverse; `**(-1)` on an ndarray is only the element-wise reciprocal.\n", "np.linalg.inv(X1.T@X1)@X1.T@y" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:14:09.855105Z", "start_time": "2021-01-28T06:14:09.843281Z" } }, "outputs": [ { "data": { "text/plain": [ "array([-1.35827281, 5.09478798, -0.64235833])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.linalg.pinv(X1)@y" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:14:18.589772Z", "start_time": "2021-01-28T06:14:18.579799Z" } }, "outputs": [ { "data": { "text/plain": [ "array([-1.35827281, 5.09478798, -0.64235833])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w = np.linalg.pinv(X1)@y\n", "w" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:14:32.086328Z", "start_time": "2021-01-28T06:14:32.075907Z" } }, "outputs": [ { "data": { "text/plain": [ "array([[1. , 6.575, 4.98 ],\n", " [1. , 6.421, 9.14 ],\n", " [1. , 7.185, 4.03 ],\n", " ...,\n", " [1. , 6.976, 5.64 ],\n", " [1. , 6.794, 6.48 ],\n", " [1. 
, 6.03 , 7.88 ]])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X1" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:14:37.220575Z", "start_time": "2021-01-28T06:14:37.210052Z" } }, "outputs": [ { "data": { "text/plain": [ "array([-1.35827281, 5.09478798, -0.64235833])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:15:08.171512Z", "start_time": "2021-01-28T06:15:08.162573Z" } }, "outputs": [ { "data": { "text/plain": [ "24.0" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y[0]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:14:53.285065Z", "start_time": "2021-01-28T06:14:53.281534Z" } }, "outputs": [], "source": [ "yp = X1@w" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:15:02.133214Z", "start_time": "2021-01-28T06:15:02.125231Z" } }, "outputs": [ { "data": { "text/plain": [ "28.941013680602513" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "yp[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Mean Square Error" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:15:37.420670Z", "start_time": "2021-01-28T06:15:37.409672Z" } }, "outputs": [ { "data": { "text/plain": [ "30.51246877729947" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.mean((y - yp)**2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Mean Absolute Error" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:15:54.894509Z", 
"start_time": "2021-01-28T06:15:54.884621Z" } }, "outputs": [ { "data": { "text/plain": [ "3.952580067119268" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.mean(np.abs(y - yp))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using the scikit-learn Library" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:16:25.036090Z", "start_time": "2021-01-28T06:16:24.600035Z" } }, "outputs": [], "source": [ "from sklearn import linear_model" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:16:26.953252Z", "start_time": "2021-01-28T06:16:26.945257Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(506, 2) (506,)\n" ] } ], "source": [ "X = data['data'][:,[5,12]]\n", "y = data['target']\n", "print(X.shape, y.shape)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:16:38.633091Z", "start_time": "2021-01-28T06:16:38.628107Z" } }, "outputs": [], "source": [ "model = linear_model.LinearRegression()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:17:06.666126Z", "start_time": "2021-01-28T06:17:06.661212Z" } }, "outputs": [], "source": [ "#help(model)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fitting (training)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:17:19.510856Z", "start_time": "2021-01-28T06:17:19.499027Z" } }, "outputs": [ { "data": { "text/plain": [ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(X,y)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:17:29.190763Z", 
"start_time": "2021-01-28T06:17:29.181926Z" } }, "outputs": [ { "data": { "text/plain": [ "(-1.3582728118744818, array([ 5.09478798, -0.64235833]))" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.intercept_, model.coef_" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:17:54.766831Z", "start_time": "2021-01-28T06:17:54.761917Z" } }, "outputs": [], "source": [ "yp = model.predict(X)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Mean Square & Absolute Error" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:18:07.871787Z", "start_time": "2021-01-28T06:18:07.863851Z" } }, "outputs": [ { "data": { "text/plain": [ "30.51246877729947" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.mean((y-yp)**2)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:18:10.915228Z", "start_time": "2021-01-28T06:18:10.907219Z" } }, "outputs": [ { "data": { "text/plain": [ "3.952580067119271" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.mean(np.abs((y-yp)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Let's use all the features" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:18:22.396248Z", "start_time": "2021-01-28T06:18:22.389141Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(506, 13) (506,)\n" ] } ], "source": [ "X = data['data']\n", "y = data['target']\n", "print(X.shape, y.shape)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:18:30.585057Z", "start_time": "2021-01-28T06:18:30.580027Z" } }, "outputs": [], "source": [ "model = linear_model.LinearRegression()" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "### Fitting (training)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:18:32.650280Z", "start_time": "2021-01-28T06:18:32.641036Z" } }, "outputs": [ { "data": { "text/plain": [ "(36.45948838509001,\n", " array([ -0.10801136, 0.04642046, 0.02055863, 2.68673382,\n", " -17.76661123, 3.80986521, 0.00069222, -1.47556685,\n", " 0.30604948, -0.01233459, -0.95274723, 0.00931168,\n", " -0.52475838]))" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(X,y)\n", "model.intercept_, model.coef_" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:18:38.253878Z", "start_time": "2021-01-28T06:18:38.247531Z" } }, "outputs": [], "source": [ "yp = model.predict(X)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Mean Square & Absolute Error" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:18:40.145221Z", "start_time": "2021-01-28T06:18:40.137243Z" } }, "outputs": [ { "data": { "text/plain": [ "21.894831181729206" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.mean((y-yp)**2)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:18:49.397039Z", "start_time": "2021-01-28T06:18:49.388902Z" } }, "outputs": [ { "data": { "text/plain": [ "3.270862810900317" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.mean(np.abs((y-yp)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Training & Testing" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:26:40.844901Z", "start_time": "2021-01-28T06:26:40.841254Z" } }, "outputs": [], "source": [ "from sklearn.model_selection 
import train_test_split" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:26:42.527737Z", "start_time": "2021-01-28T06:26:42.517294Z" } }, "outputs": [ { "data": { "text/plain": [ "((354, 13), (354,), (152, 13), (152,))" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# random_state pins the shuffle so the split, and every metric computed from it,\n", "# is the same on each Restart & Run All.\n", "Xt, Xs, yt, ys = train_test_split(X,y,test_size=0.3,random_state=42)\n", "\n", "Xt.shape, yt.shape, Xs.shape, ys.shape" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:27:07.351160Z", "start_time": "2021-01-28T06:27:07.347460Z" } }, "outputs": [], "source": [ "model = linear_model.LinearRegression()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Training" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:27:16.725539Z", "start_time": "2021-01-28T06:27:16.716635Z" } }, "outputs": [ { "data": { "text/plain": [ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(Xt,yt)\n", "#model.intercept_, model.coef_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Testing & Predicting" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "ExecuteTime": { "end_time": "2021-01-28T06:27:32.602435Z", "start_time": "2021-01-28T06:27:32.597413Z" } }, "outputs": [], "source": [ "ytp = model.predict(Xt)\n", "ysp = model.predict(Xs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Mean Square & Absolute Error" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ "np.mean((yt-ytp)**2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.mean((ys-ysp)**2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Actual Prediction" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "yt[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ytp[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ys[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ysp[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, 
"toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Regression: Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }