{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Linear Models" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import addutils.toc ; addutils.toc.js(ipy_notebook=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "from addutils import css_notebook\n", "from sklearn.datasets.samples_generator import make_regression\n", "from sklearn import linear_model, neighbors\n", "from sklearn import metrics\n", "css_notebook()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " Loading BokehJS ...\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", "(function(root) {\n", " function now() {\n", " return new Date();\n", " }\n", "\n", " var force = true;\n", "\n", " if (typeof (root._bokeh_onload_callbacks) === \"undefined\" || force === true) {\n", " root._bokeh_onload_callbacks = [];\n", " root._bokeh_is_loading = undefined;\n", " }\n", "\n", " var JS_MIME_TYPE = 'application/javascript';\n", " var HTML_MIME_TYPE = 'text/html';\n", " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", " var CLASS_NAME = 'output_bokeh rendered_html';\n", "\n", " /**\n", " * Render data to the DOM node\n", " */\n", " function render(props, node) {\n", " var script = document.createElement(\"script\");\n", " node.appendChild(script);\n", " }\n", "\n", " /**\n", " * Handle when an output is cleared or removed\n", " */\n", " function handleClearOutput(event, handle) {\n", " var cell = handle.cell;\n", "\n", " var id = cell.output_area._bokeh_element_id;\n", " var server_id = cell.output_area._bokeh_server_id;\n", " // Clean up Bokeh references\n", " if (id !== undefined) {\n", " Bokeh.index[id].model.document.clear();\n", " delete Bokeh.index[id];\n", " }\n", "\n", " if (server_id !== undefined) {\n", " // Clean up Bokeh references\n", " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", " cell.notebook.kernel.execute(cmd, {\n", " iopub: {\n", " output: function(msg) {\n", " var element_id = msg.content.text.trim();\n", " Bokeh.index[element_id].model.document.clear();\n", " delete Bokeh.index[element_id];\n", " }\n", " }\n", " });\n", " // Destroy server and session\n", " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", " cell.notebook.kernel.execute(cmd);\n", " }\n", " }\n", "\n", " /**\n", " * Handle when a new output is added\n", " */\n", " function handleAddOutput(event, 
handle) {\n", " var output_area = handle.output_area;\n", " var output = handle.output;\n", "\n", " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", " return\n", " }\n", "\n", " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", "\n", " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", " toinsert[0].firstChild.textContent = output.data[JS_MIME_TYPE];\n", " // store reference to embed id on output_area\n", " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", " }\n", " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", " var bk_div = document.createElement(\"div\");\n", " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", " var script_attrs = bk_div.children[0].attributes;\n", " for (var i = 0; i < script_attrs.length; i++) {\n", " toinsert[0].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", " }\n", " // store reference to server id on output_area\n", " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", " }\n", " }\n", "\n", " function register_renderer(events, OutputArea) {\n", "\n", " function append_mime(data, metadata, element) {\n", " // create a DOM node to render to\n", " var toinsert = this.create_output_subarea(\n", " metadata,\n", " CLASS_NAME,\n", " EXEC_MIME_TYPE\n", " );\n", " this.keyboard_manager.register_events(toinsert);\n", " // Render to node\n", " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", " render(props, toinsert[0]);\n", " element.append(toinsert);\n", " return toinsert\n", " }\n", "\n", " /* Handle when an output is cleared or removed */\n", " events.on('clear_output.CodeCell', handleClearOutput);\n", " events.on('delete.Cell', handleClearOutput);\n", "\n", " /* Handle when a new output is added */\n", " events.on('output_added.OutputArea', 
handleAddOutput);\n", "\n", " /**\n", " * Register the mime type and append_mime function with output_area\n", " */\n", " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", " /* Is output safe? */\n", " safe: true,\n", " /* Index of renderer in `output_area.display_order` */\n", " index: 0\n", " });\n", " }\n", "\n", " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", " if (root.Jupyter !== undefined) {\n", " var events = require('base/js/events');\n", " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", "\n", " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", " register_renderer(events, OutputArea);\n", " }\n", " }\n", "\n", " \n", " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", " root._bokeh_timeout = Date.now() + 5000;\n", " root._bokeh_failed_load = false;\n", " }\n", "\n", " var NB_LOAD_WARNING = {'data': {'text/html':\n", " \"
\\n\"+\n", " \"

\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"

\\n\"+\n", " \"
    \\n\"+\n", " \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n", " \"
  • use INLINE resources instead, as so:
  • \\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n", " \"from bokeh.resources import INLINE\\n\"+\n", " \"output_notebook(resources=INLINE)\\n\"+\n", " \"\\n\"+\n", " \"
\"}};\n", "\n", " function display_loaded() {\n", " var el = document.getElementById(\"304772ea-22b3-4367-be08-65649485e5af\");\n", " if (el != null) {\n", " el.textContent = \"BokehJS is loading...\";\n", " }\n", " if (root.Bokeh !== undefined) {\n", " if (el != null) {\n", " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", " }\n", " } else if (Date.now() < root._bokeh_timeout) {\n", " setTimeout(display_loaded, 100)\n", " }\n", " }\n", "\n", "\n", " function run_callbacks() {\n", " try {\n", " root._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n", " }\n", " finally {\n", " delete root._bokeh_onload_callbacks\n", " }\n", " console.info(\"Bokeh: all callbacks have finished\");\n", " }\n", "\n", " function load_libs(js_urls, callback) {\n", " root._bokeh_onload_callbacks.push(callback);\n", " if (root._bokeh_is_loading > 0) {\n", " console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", " return null;\n", " }\n", " if (js_urls == null || js_urls.length === 0) {\n", " run_callbacks();\n", " return null;\n", " }\n", " console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", " root._bokeh_is_loading = js_urls.length;\n", " for (var i = 0; i < js_urls.length; i++) {\n", " var url = js_urls[i];\n", " var s = document.createElement('script');\n", " s.src = url;\n", " s.async = false;\n", " s.onreadystatechange = s.onload = function() {\n", " root._bokeh_is_loading--;\n", " if (root._bokeh_is_loading === 0) {\n", " console.log(\"Bokeh: all BokehJS libraries loaded\");\n", " run_callbacks()\n", " }\n", " };\n", " s.onerror = function() {\n", " console.warn(\"failed to load library \" + url);\n", " };\n", " console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", " document.getElementsByTagName(\"head\")[0].appendChild(s);\n", " }\n", " };var element = document.getElementById(\"304772ea-22b3-4367-be08-65649485e5af\");\n", " if (element == 
null) {\n", " console.log(\"Bokeh: ERROR: autoload.js configured with elementid '304772ea-22b3-4367-be08-65649485e5af' but no matching script tag was found. \")\n", " return false;\n", " }\n", "\n", " var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.13.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.13.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.13.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-0.12.13.min.js\"];\n", "\n", " var inline_js = [\n", " function(Bokeh) {\n", " Bokeh.set_log_level(\"info\");\n", " },\n", " \n", " function(Bokeh) {\n", " \n", " },\n", " function(Bokeh) {\n", " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.13.min.css\");\n", " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.13.min.css\");\n", " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.13.min.css\");\n", " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.13.min.css\");\n", " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.13.min.css\");\n", " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.13.min.css\");\n", " }\n", " ];\n", "\n", " function run_inline_js() {\n", " \n", " if ((root.Bokeh !== undefined) || (force === true)) {\n", " for (var i = 0; i < inline_js.length; i++) {\n", " inline_js[i].call(root, root.Bokeh);\n", " }if (force === true) {\n", " display_loaded();\n", " }} else if (Date.now() < root._bokeh_timeout) {\n", " setTimeout(run_inline_js, 100);\n", " } else if (!root._bokeh_failed_load) {\n", " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", " root._bokeh_failed_load = true;\n", " } else if (force !== true) {\n", " var cell = $(document.getElementById(\"304772ea-22b3-4367-be08-65649485e5af\")).parents('.cell').data().cell;\n", " 
cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", " }\n", "\n", " }\n", "\n", " if (root._bokeh_is_loading === 0) {\n", " console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", " run_inline_js();\n", " } else {\n", " load_libs(js_urls, function() {\n", " console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n", " run_inline_js();\n", " });\n", " }\n", "}(window));" ], "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof (root._bokeh_onload_callbacks) === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(\"304772ea-22b3-4367-be08-65649485e5af\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n }\n finally {\n delete root._bokeh_onload_callbacks\n }\n console.info(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(js_urls, callback) {\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = js_urls.length;\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var s = document.createElement('script');\n s.src = url;\n s.async = false;\n s.onreadystatechange = s.onload = function() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.log(\"Bokeh: all BokehJS libraries loaded\");\n run_callbacks()\n }\n };\n s.onerror = function() {\n console.warn(\"failed to load library \" + url);\n };\n console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.getElementsByTagName(\"head\")[0].appendChild(s);\n }\n };var element = document.getElementById(\"304772ea-22b3-4367-be08-65649485e5af\");\n if (element == null) {\n console.log(\"Bokeh: ERROR: autoload.js configured with elementid '304772ea-22b3-4367-be08-65649485e5af' but no matching script tag was found. 
\")\n return false;\n }\n\n var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.13.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.13.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.13.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-0.12.13.min.js\"];\n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n \n function(Bokeh) {\n \n },\n function(Bokeh) {\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.13.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.12.13.min.css\");\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.13.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.13.min.css\");\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.13.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-tables-0.12.13.min.css\");\n }\n ];\n\n function run_inline_js() {\n \n if ((root.Bokeh !== undefined) || (force === true)) {\n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(\"304772ea-22b3-4367-be08-65649485e5af\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(js_urls, function() {\n console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n 
}\n}(window));" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import bokeh.plotting as bk\n", "from bokeh import palettes\n", "from bokeh.models import Range1d, FixedTicker\n", "from bokeh.layouts import gridplot\n", "bk.output_notebook()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "%matplotlib notebook" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1 Introduction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this notebook we introduce linear models for regression and classification. This introduction also serves to introduce the main terminology of ML as well as one important aspect of ML practice: the *curse of dimensionality*." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2 Linear Model for Regression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`LinearRegression` fits a linear model with coefficients $w = (w_1, \\ldots, w_p)$ to minimize the residual sum of squares between the observed responses in the dataset and the responses predicted by the linear approximation.\n", "\n", "Mathematically it solves a problem of the form: \n", "$$ \\underset{w}{\\min} \\|Xw - y\\|_2^2$$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.1 Simple Linear Regression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Simple linear regression is an approach for predicting a **quantitative response** using a **single feature** (or \"predictor\" or \"input variable\"). It has the following mathematical definition:\n", "\n", "$y = \\beta_0 + \\beta_1x$\n", "\n", "Where:\n", "- $y$ is the response or **target**\n", "- $x$ is the **feature**\n", "- $\\beta_0$ is the intercept\n", "- $\\beta_1$ is the coefficient for $x$\n", "\n", "$\\beta_0$ and $\\beta_1$ are called the **model coefficients** or **parameters**. 
To build the model, it is necessary to **learn** the values of these coefficients. Once the model has learned these coefficients, it can be used for prediction. We are learning a function, sometimes called the **hypothesis**: $h(x) = \\beta_0 + \\beta_1x$\n", "\n", "In supervised learning we are going to learn a model from examples. What are examples? They are given (feature, response) pairs. For example, suppose we want to learn to predict the price of a house based on its size in feet$^2$. You are presented with a **dataset**: a table with two columns, one that represents the size and the other that represents the price." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An **example** is a row in the table. The number of examples $n$ is the size of the dataset. A pair $(x, y)$ is one training example; $(x_i, y_i)$ refers to the $i^{th}$ training example." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.2 Interpreting Model Coefficients" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How do we interpret the coefficient $\\beta_1$? A one-*unit* **increase** in the variable $x$ is **associated with** a $\\beta_1$-*unit* **increase** in the variable $y$. Note that if an increase in $x$ is associated with a **decrease** in $y$, $\\beta_1$ would be **negative**." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "X, y = make_regression(n_samples=100000, n_features=1, n_informative=1,\n", " random_state=0, noise=35)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "(function(root) {\n", " function embed_document(root) {\n", " \n", " var docs_json = {\"b7430cf6-2192-4126-94cf-d8afac44c738\":{\"roots\":{\"references\":[{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"plot\":null,\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"af5271e0-6765-45fc-9cdc-cb010c3777b7\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"callback\":null},\"id\":\"b2eecebe-2b4a-4b31-9b9b-b605086532b6\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"3690e622-e9e1-4382-83ca-03e03a9905a9\",\"type\":\"PanTool\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"3690e622-e9e1-4382-83ca-03e03a9905a9\",\"type\":\"PanTool\"},{\"id\":\"320819a5-d997-44a7-a52b-7293bf9d0ca0\",\"type\":\"WheelZoomTool\"},{\"id\":\"f503e879-2b53-4cb8-a257-dac48f22f035\",\"type\":\"BoxZoomTool\"},{\"id\":\"c8febe0d-e041-498d-ab89-fdf8f5241478\",\"type\":\"SaveTool\"},{\"id\":\"9efc7dad-5a81-49d9-a34e-af7c76447a53\",\"type\":\"ResetTool\"},{\"id\":\"3655bffb-b291-41c9-9eeb-b69b64a7e648\",\"type\":\"HelpTool\"}]},\"id\":\"b1d9f31e-e6be-4241-82f0-f1291d681d79\",\"type\":\"Toolbar\"},{\"attributes\":{\"fill_color\":{\"value\":\"black\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"b9014e0b-9ff4-4e18-8401-bebbab70b10f\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"320819a5-d997-44a7-a52b-7293bf9d0ca0\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"overlay\":{\"id\":\"af5271e0-6765-45fc-9cdc-cb010c3777b7\",\"type\":\"BoxAnnotation\"}},\"id\":\"f503e879-2b53-4cb8-a257-dac48f22f035\",\"type\":\"BoxZoomTool\"},{\"attri
butes\":{\"line_color\":\"red\",\"line_width\":3,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"14e0bbe2-0b8b-413a-89e5-4beee5e7220d\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"c8febe0d-e041-498d-ab89-fdf8f5241478\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1f91b89b-f2a7-4da0-bcea-f948bca54409\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"9efc7dad-5a81-49d9-a34e-af7c76447a53\",\"type\":\"ResetTool\"},{\"attributes\":{\"callback\":null},\"id\":\"51461341-ca77-41df-a0fb-a95e21a4f600\",\"type\":\"DataRange1d\"},{\"attributes\":{\"callback\":null,\"column_names\":[\"x\",\"y\"],\"data\":{\"x\":{\"__ndarray__\":\"MqNhARb47z/xq5BPfCXwPwoV11+YwH8/+OB1NJ4u6j+UnUAMN7TuvwI9nTkwj/i/mg4SYvTd8D8upzHE0Hz6P4Iha/CxjvS/IWCvl2sc9r/wh7HgDBjIPxhfXwUXf/G/9lsYNUWx4T+PiounKkL6P4XiHNE6/vy/WFkCwlYM4L81AycenmnnP9mV7eZVxsK/tVQpYCFa8L8wNELzK++wP6hvNYGW6Na/GTzlruiq978b9nYKuJ7tP61nUWXKweo/WrRpmaFT47/lqMgGVHf0P0KcFdSIBwBA4wlsELGI4D+vhc/9qVfSv09M47hI1QlAo5i2DiSj8r8SgTPnrPTwP5AKD3ZoEsQ/8MDx3O2W1z8zkiKLHunKPx9eMUTM85g/KLMYS8mt4D/yeW3e4vTPP6t4atNROPW/2YUTzmAZ8D+qEaaB1nfuv5LiWYyX7c6/nZuf9OsK+D+04b42qdjVPwZG8ZaijNc/D7ELYtSt3D/gHC4CZEDov4k85SuoqvI/Huhf90Ksqr+3hGfOUqzfP5CTPjhXKem/u31ZcZR0z7/kkLxWjvzmv7KKfsNYNe6/gpJ4EQ5jy7/yTNz1JuXNv7H+7WuzVAPAy5bBgvjeyb+igHDcolzWP+TQDa2wp/I/0HqaRczU6r/bgPzJBcn7vxrL5y0lrfC/MDqqh/8Z8j8LFNUNlt74P18VC34N/No//YiOjl5Z+L/jW0APwgLwPyHRzDjse8m/urB9OU/zAsCCVwZfS9pvv6JuW36dtuQ/yLbpi8t//T9//7VBbpC9v4cCOncTKPY/U0kQqyuQwb8tUEqGLjzzv9LMEzwpN/c/FmZ1NG6c0b+YjH9ar3bsvxKIRq1GyNg/b6GxptwC1z+PT4uHrjbmvytM9iksStE/m+DoE+hT9L9mgjBdBvnyv78iGoBbqeG/lx0Mxuc/9j9t7ZVqclz3v1UYLCeFPvS/bSbjGLbQwD8rEI4ili/mv6irRA6xIOW/bdZ6NmUB+T81Jru8cAAAwG3pvW24qT2/TTH4IuJKAEBMtV/aW4zxP0RUTZjVYao/ckpVmKLO5j8=\",\"dtype\":\"float64\",\"shape\":[100]},\"y\":{\"__ndarray__\":\"fYYIQhndU0CL4li5dhBUQE3y3ZDwVOo/3s4UQwNHUEDfcnwNxvlSwGea3zk0Y17AVLE+hxn1VEAoad5/0XBgQNuxPkZrbVnAVsR12l5aW8DTaSkxUkcuQKC3rIoyolXAChOJr1EIRkARYaYMeUxgQMPzt6wA8WHAIO2CnV7JQ8BtZWRnRR9NQHBwOOGh2ibAoletaRg3VMCCgmWJEtIVQCm
wRal+LzzAFc6hqURIXcAfNRxdbGhSQAFrNLY3olBAPSVHh5zZR8BfNiZIE2tZQIZDFuav5GNAomvhOruYREA638XXsoY2wNH/imyrBXBAbotqXiwMV8ArmZiuQhFVQA0HZ4MnSylA3CEbmQxyPUB9h2dPhOIwQIrWQ3l9IAFAtfedHbXGREAJ2pPbCgM0QGSMYgaoP1rAIDSlHHUBVEBvuElsW9RSwCdHqnxn9TLAIFd5CePZXUDyq0ep7kg7QHYYRF5KZT1AF9+rJnPgQUATSlMnO/RNwFjSS1IbMFdACSxWzT9lD8CwOG3ddLtDQGEyian0FE/A+Sm93Q5JM8Bfy6mP3GJMwHYhEdEmq1LAfxhIYpzDMMAHY838hlEywJSTt4bm7mfA5TfcNjimL8CDhsODgew7QAteZ/ttLFdAQ9UhAmGTUMAkjPD7YTFhwGcleXr8nVTAF6n+DdB8VkBnlD/EOuBeQNveleCh00BABw7azn8gXsBuGWrna+VTQGgZDPB0Ky/AX+ly4TB2Z8DMM6vxALm5v1EkJubSxklATGWl3ZpOYkDqMzLlyechwA+u7kxvg1tAfutqlTRaJcCD2vs628lXwNzdK8ps01xATyHT4KKeNcDHE8RnWZZRwNGr4SiB7D5ANy3914e6PEBOfElenG1LwLBck5QnozVArPy9EI4kWcDwCR3MnnZXwH/5oalGyUXAY+VKMPigW0AYC/kxBedcwOUNUUsMClnAgQxEDdlBJUCgSlMU0WRLwGndAfAPFUrAxXsCgF8LX0Dhubvolc5jwF2yXZKrBcY/fwsqPyk4ZEC9oEDSQs1VQBkfALhiLhFA/MRbei5fTEA=\",\"dtype\":\"float64\",\"shape\":[100]}}},\"id\":\"b63b1233-20ae-4948-bb11-3f60f9eafa47\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"callback\":null,\"column_names\":[\"x\",\"y\"],\"data\":{\"x\":{\"__ndarray__\":\"MqNhARb47z/xq5BPfCXwPwoV11+YwH8/+OB1NJ4u6j+UnUAMN7TuvwI9nTkwj/i/mg4SYvTd8D8upzHE0Hz6P4Iha/CxjvS/IWCvl2sc9r/wh7HgDBjIPxhfXwUXf/G/9lsYNUWx4T+PiounKkL6P4XiHNE6/vy/WFkCwlYM4L81AycenmnnP9mV7eZVxsK/tVQpYCFa8L8wNELzK++wP6hvNYGW6Na/GTzlruiq978b9nYKuJ7tP61nUWXKweo/WrRpmaFT47/lqMgGVHf0P0KcFdSIBwBA4wlsELGI4D+vhc/9qVfSv09M47hI1QlAo5i2DiSj8r8SgTPnrPTwP5AKD3ZoEsQ/8MDx3O2W1z8zkiKLHunKPx9eMUTM85g/KLMYS8mt4D/yeW3e4vTPP6t4atNROPW/2YUTzmAZ8D+qEaaB1nfuv5LiWYyX7c6/nZuf9OsK+D+04b42qdjVPwZG8ZaijNc/D7ELYtSt3D/gHC4CZEDov4k85SuoqvI/Huhf90Ksqr+3hGfOUqzfP5CTPjhXKem/u31ZcZR0z7/kkLxWjvzmv7KKfsNYNe6/gpJ4EQ5jy7/yTNz1JuXNv7H+7WuzVAPAy5bBgvjeyb+igHDcolzWP+TQDa2wp/I/0HqaRczU6r/bgPzJBcn7vxrL5y0lrfC/MDqqh/8Z8j8LFNUNlt74P18VC34N/No//YiOjl5Z+L/jW0APwgLwPyHRzDjse8m/urB9OU/zAsCCVwZfS9pvv6JuW36dtuQ/yLbpi8t//T9//7VBbpC9v4cCOncTKPY/U0kQqyuQwb8tUEqGLjzzv9LMEzwpN/c/FmZ1NG6c0b+YjH9ar3bsvxKIRq1GyNg/b6GxptwC1z+PT4uHrjbmvytM9iksStE/m+DoE+hT9L9mgjBdBvnyv78iGoB
bqeG/lx0Mxuc/9j9t7ZVqclz3v1UYLCeFPvS/bSbjGLbQwD8rEI4ili/mv6irRA6xIOW/bdZ6NmUB+T81Jru8cAAAwG3pvW24qT2/TTH4IuJKAEBMtV/aW4zxP0RUTZjVYao/ckpVmKLO5j8=\",\"dtype\":\"float64\",\"shape\":[100]},\"y\":{\"__ndarray__\":\"HKyN58TfNkAqcCmmHGtLQKfLccn0vDHAnJ8AJl76UUCsp1WUWqljwKIHgeqGVlTA9tbCX+HeWECBjYHasbNiQMvjOqK1F2PAThyQJb67YcBZ4Wa5fTQwwL2avafZXlzAbqrnAhfFU0CcutvIKJ5lQFr5kqGx7WbA/ai+4iWOQcByzvurxLlTQIb6JFZ6ODzAq67P/ymQWcCuaCNKwAszQLyF4yx2oSzANppNmW8ZTMB4JDaGTshmQJD3pI1nKjpAiGLwGul6QsAsCuZOEmJJQLbz+3ePA2BALD6q3e1UIsBvuY2rU2w5QIxSsCAlTHFANOunMO9eQcBX3cFVYDFCQBCS7xah2v2/MrpXblTTQ8A7cj5YFnQ+wDrbJxVw3yJAf5IsVPcDQUBR9iclGOtSQGJbQcoLklbAKRi7qq96VEDGtPWytmhbwBDpl7XbMiZAj2cryny2ZEA8zFgguD4dQA4lMw3Sp0JAkXU7KBQKREB+MWh9I/c3wAFQEbuxCVZATYR9CJcUNMBJbCk9U+05QOiHOMpwYFfANjzQ/RnrRMAalzQSgA47wGahuGE0QUzA9QftGJJCSsAufSsCT9EzQF+C1gciIGPAKdtpRq9mRcB6l4qBNQE7QGANk/m4lVdA8JzBNdyXWcCHjH6vBBFfwGCdfP3y1VnAcOBJh7WLX0Damn+bvFdSQLj6ywMM2ynACZ5AoSPlXsA2E/oDqF9JQO622A9LPUjA/NDJp/X7acA5oAWyoS8yQDOe6hzkMlRAT+338HNiZ0DZMTKDvawxQJ4hHQ/VS1pApHlcEj2oN8AG+Iht9PlfwK3rJrF5p2FAPDlm1osBHkA3Yu/Y6WhWwCbbCjvnbDBAbed7Zn1fS0DQ+9q3LoNVwL/bJgGUIFRAE/3bQeVgZ8BgKfH7YGIwwIm5h0QbsTLA8CaETUYHXEBcs+iFvLZdwE4/qWmmAk/AbvyD5A1mOMCN9V8iiQZAwF5pxKBjK1nAWz8lSoX7YUASnd7VmkhlwN7w6vNNBDdAQZ2OKZUzaUAFU9iuNzFFQIGj7QzEVT3AmFqPMWWtV0A=\",\"dtype\":\"float64\",\"shape\":[100]}}},\"id\":\"100d3740-17e2-40ff-b9d2-c273e401dd4d\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data_source\":{\"id\":\"100d3740-17e2-40ff-b9d2-c273e401dd4d\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"b9014e0b-9ff4-4e18-8401-bebbab70b10f\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"82e6319c-54c6-413c-bec4-d21c40468eb6\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"4665b65c-ee36-420d-9123-6e73a260a637\",\"type\":\"CDSView\"}},\"id\":\"a403387b-d86d-40f6-b8ee-2a5867b8cb9c\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"0db0bc1f-fb0b-4279-826d-bb6b5c95f7d7\",\"type\":\"LinearSca
le\"},{\"attributes\":{},\"id\":\"3655bffb-b291-41c9-9eeb-b69b64a7e648\",\"type\":\"HelpTool\"},{\"attributes\":{\"data_source\":{\"id\":\"b63b1233-20ae-4948-bb11-3f60f9eafa47\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"14e0bbe2-0b8b-413a-89e5-4beee5e7220d\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"af09abe1-a473-4778-bc78-9c3fbd9e8064\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"133bcd5e-f149-4069-906a-986a4bd27efc\",\"type\":\"CDSView\"}},\"id\":\"c210922a-ad46-414c-8556-20a4c94ef6c7\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"01417afe-7327-451f-b307-800e3b68e3f6\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"100d3740-17e2-40ff-b9d2-c273e401dd4d\",\"type\":\"ColumnDataSource\"}},\"id\":\"4665b65c-ee36-420d-9123-6e73a260a637\",\"type\":\"CDSView\"},{\"attributes\":{\"formatter\":{\"id\":\"27cf2afe-4d06-4795-958f-d8ec8be11d3e\",\"type\":\"BasicTickFormatter\"},\"minor_tick_out\":0,\"plot\":{\"id\":\"535181ca-3303-40f6-9f50-3635584fc84f\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"4472b36c-a96e-48bc-8665-770721e1bc5c\",\"type\":\"BasicTicker\"}},\"id\":\"5098a458-6de6-415a-b67d-e85105161db4\",\"type\":\"LinearAxis\"},{\"attributes\":{\"source\":{\"id\":\"b63b1233-20ae-4948-bb11-3f60f9eafa47\",\"type\":\"ColumnDataSource\"}},\"id\":\"133bcd5e-f149-4069-906a-986a4bd27efc\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"4472b36c-a96e-48bc-8665-770721e1bc5c\",\"type\":\"BasicTicker\"},{\"attributes\":{\"grid_line_color\":{\"value\":null},\"plot\":{\"id\":\"535181ca-3303-40f6-9f50-3635584fc84f\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"4472b36c-a96e-48bc-8665-770721e1bc5c\",\"type\":\"BasicTicker\"}},\"id\":\"00721d2e-82b9-4612-a7e8-a6a0ae5614e0\",\"type\":\"Grid\"},{\"attributes\":{\"formatter\":{\"id\":\"01417afe-7327-451f-b307-800e3b68e3f6\",\"type\":\"BasicTickFormatter\"},\"minor_tick_out\":0,\"plot\":{\
"id\":\"535181ca-3303-40f6-9f50-3635584fc84f\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"fd0f50f8-f130-4c88-86b5-ceea755dec74\",\"type\":\"BasicTicker\"}},\"id\":\"ea1534f2-bd50-4bdf-9abc-89c9777d2707\",\"type\":\"LinearAxis\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"82e6319c-54c6-413c-bec4-d21c40468eb6\",\"type\":\"Circle\"},{\"attributes\":{\"below\":[{\"id\":\"5098a458-6de6-415a-b67d-e85105161db4\",\"type\":\"LinearAxis\"}],\"left\":[{\"id\":\"ea1534f2-bd50-4bdf-9abc-89c9777d2707\",\"type\":\"LinearAxis\"}],\"plot_height\":300,\"plot_width\":630,\"renderers\":[{\"id\":\"5098a458-6de6-415a-b67d-e85105161db4\",\"type\":\"LinearAxis\"},{\"id\":\"00721d2e-82b9-4612-a7e8-a6a0ae5614e0\",\"type\":\"Grid\"},{\"id\":\"ea1534f2-bd50-4bdf-9abc-89c9777d2707\",\"type\":\"LinearAxis\"},{\"id\":\"855fa90d-014b-475a-90fa-548cfb8ca537\",\"type\":\"Grid\"},{\"id\":\"af5271e0-6765-45fc-9cdc-cb010c3777b7\",\"type\":\"BoxAnnotation\"},{\"id\":\"a403387b-d86d-40f6-b8ee-2a5867b8cb9c\",\"type\":\"GlyphRenderer\"},{\"id\":\"c210922a-ad46-414c-8556-20a4c94ef6c7\",\"type\":\"GlyphRenderer\"}],\"title\":null,\"toolbar\":{\"id\":\"b1d9f31e-e6be-4241-82f0-f1291d681d79\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"b2eecebe-2b4a-4b31-9b9b-b605086532b6\",\"type\":\"DataRange1d\"},\"x_scale\":{\"id\":\"1f91b89b-f2a7-4da0-bcea-f948bca54409\",\"type\":\"LinearScale\"},\"y_range\":{\"id\":\"51461341-ca77-41df-a0fb-a95e21a4f600\",\"type\":\"DataRange1d\"},\"y_scale\":{\"id\":\"0db0bc1f-fb0b-4279-826d-bb6b5c95f7d7\",\"type\":\"LinearScale\"}},\"id\":\"535181ca-3303-40f6-9f50-3635584fc84f\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"fd0f50f8-f130-4c88-86b5-ceea755dec74\",\"type\":\"BasicTicker\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":3,\"x\":{\
"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"af09abe1-a473-4778-bc78-9c3fbd9e8064\",\"type\":\"Line\"},{\"attributes\":{\"dimension\":1,\"grid_line_color\":{\"value\":null},\"plot\":{\"id\":\"535181ca-3303-40f6-9f50-3635584fc84f\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"fd0f50f8-f130-4c88-86b5-ceea755dec74\",\"type\":\"BasicTicker\"}},\"id\":\"855fa90d-014b-475a-90fa-548cfb8ca537\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"27cf2afe-4d06-4795-958f-d8ec8be11d3e\",\"type\":\"BasicTickFormatter\"}],\"root_ids\":[\"535181ca-3303-40f6-9f50-3635584fc84f\"]},\"title\":\"Bokeh Application\",\"version\":\"0.12.13\"}};\n", " var render_items = [{\"docid\":\"b7430cf6-2192-4126-94cf-d8afac44c738\",\"elementid\":\"791bfe86-6552-49ec-9f2d-64fbc3e596bf\",\"modelid\":\"535181ca-3303-40f6-9f50-3635584fc84f\"}];\n", " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", "\n", " }\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " } else {\n", " var attempts = 0;\n", " var timer = setInterval(function(root) {\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " clearInterval(timer);\n", " }\n", " attempts++;\n", " if (attempts > 100) {\n", " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\")\n", " clearInterval(timer);\n", " }\n", " }, 10, root)\n", " }\n", "})(window);" ], "application/vnd.bokehjs_exec.v0+json": "" }, "metadata": { "application/vnd.bokehjs_exec.v0+json": { "id": "535181ca-3303-40f6-9f50-3635584fc84f" } }, "output_type": "display_data" } ], "source": [ "regr = linear_model.LinearRegression()\n", "regr.fit(X, y)\n", "\n", "fig = bk.figure(plot_width=630, plot_height=300, title=None)\n", "fig.circle(X[::1000, 0], y[::1000], color='black')\n", "fig.line(X[::1000, 0], regr.predict(X[::1000]), color='red', line_width=3)\n", "fig.grid.grid_line_color = None\n", "fig.axis.minor_tick_out = 0\n", "bk.show(fig)" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "### 2.3 Estimating Coefficients" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generally speaking, coefficients are estimated using the **least squares criterion**, which finds the line that minimizes the **sum of squared residuals** (or \"sum of squared errors\"):" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Where:\n", "- The black dots are the **observed values** of x and y.\n", "- The blue line is our **least squares line**.\n", "- The red lines are the **residuals**, which are the distances between the observed values and the least squares line.\n", "\n", "How do the model coefficients relate to the least squares line?\n", "- $\\beta_0$ is the **intercept** (the value of $y$ when $x$=0)\n", "- $\\beta_1$ is the **slope** (the change in $y$ divided by change in $x$)\n", "\n", "Here is a graphical representation:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Different choices of $\\beta_0$ and $\\beta_1$ give us different hypothesis. How do I choose $\\beta_0$ and $\\beta_1$ in order to get the best regression line? That is we want to find the best couple of $\\beta_0$ and $\\beta_1$ that minimizes the difference between the output of our hypothesis and the target. Using the notation above:\n", "$$min_{\\beta_0, \\beta_1} \\quad J(\\beta_0, \\beta_1) = \\frac{1}{2n}\\sum_i^n (h(x_i)-y_i)^2$$\n", "the second part of the expression is the **cost function** (*mean squared error* in this case) and it depends on $\\beta_0$ and $\\beta_1$, so changing their values changes the cost. Why we use the squared error? It is one of the most used functions, but there are many other. We will see why it is useful. 
\n", "\n", "To recap, we have:\n", "- a *hypothesis*: the equation of the line\n", "- *parameters* to learn: $\\beta_0$ and $\\beta_1$\n", "- *cost function*: mean squared error\n", "- *goal*: minimize the error\n", "\n", "To ease the discussion we will work through an example with a single-parameter hypothesis: $h(x) = \\beta_1x$; we are going to find the value of $\\beta_1$ that minimizes $J(\\beta_1) = \\frac{1}{2n}\\sum_{i=1}^n (h(x_i)-y_i)^2$. How does varying the parameter affect the cost function? How does the error vary?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What happens if we keep both the coefficients $\\beta_0$ and $\\beta_1$? We have a quadratic bowl, such as the one in the picture below. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.4 Learning algorithm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How do we learn the parameters? We have a quadratic surface: how about starting with some random values for the parameters $\\beta_0$, $\\beta_1$ and then changing them step by step until the cost function reaches its minimum? This procedure is called **gradient descent**. On a general cost surface there could be several **local minima** as well as a **global minimum**, and changing the initialization slightly can change the outcome of the optimization. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What is the update step? We move against the slope (downhill), using the derivative of the error function. How far do we move? By a fixed proportion $\\alpha$ of this derivative, called the *learning rate*:\n", "\n", "$$\\beta_i = \\beta_i - \\alpha \\frac{\\partial}{\\partial \\beta_i} J(\\beta_0, \\beta_1) \\quad i=0,1$$\n", "\n", "How do we apply gradient descent to minimize the mean squared error cost function? 
We plug our $J$ into the parameter update rule and take the derivative. \n", "\n", "$$\\frac{\\partial}{\\partial \\beta_0} J(\\beta_0, \\beta_1) = \\frac{\\partial}{\\partial \\beta_0}\\frac{1}{2n}\\sum_{i=1}^n [(\\beta_0 + \\beta_1 x_i)-y_i]^2 = \\frac{1}{n} \\sum_{i=1}^n (h(x_i)-y_i)$$\n", "$$\\frac{\\partial}{\\partial \\beta_1} J(\\beta_0, \\beta_1) = \\frac{\\partial}{\\partial \\beta_1}\\frac{1}{2n}\\sum_{i=1}^n [(\\beta_0 + \\beta_1 x_i)-y_i]^2 = \\frac{1}{n} \\sum_{i=1}^n (h(x_i)-y_i) x_i$$\n", "\n", "Actually, the cost function for linear regression is convex: it has no local minima other than the global one, so gradient descent (with a suitable learning rate) always converges to the global minimum. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.5 Multiple features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So far we have tried to predict an outcome based on a single input variable (or feature). What if we have multiple variables as input? Does the accuracy of the model improve?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this case we can call the first four features $x_1, x_2, x_3, x_4$, while the target (the price) is $y$. Here we introduce another concept in ML: dimensionality. The dimensionality of a dataset is the number of features that it has. We will denote the number of features of a dataset with the letter $p$. Here $p = 4$ (whereas in the previous example $p=1$). To simplify, $n$ is the number of rows in the table, while $p$ is the number of feature columns of the table.\n", "\n", "If we look at the previous notation, we can upgrade it so that now $x_i^{(j)}$ denotes the $j^{th}$ feature of the $i^{th}$ example. With multiple features the table is a matrix and each example $x_i$ is a $p$-dimensional vector. 
Talking again about notation, if we have multiple features our hypothesis becomes: \n", "\n", "$$ h(x) = \\beta_0 + \\beta_1x^{(1)} + \\beta_2x^{(2)} + \\ldots + \\beta_px^{(p)}$$\n", "\n", "Since writing out every term quickly becomes unwieldy, we switch to vectorized notation. We use a fake feature $x^{(0)}$ for the intercept, which we set to $1$ for each example, so that our notation becomes:\n", "\n", "$$ h(x) = \\beta_0x^{(0)} + \\beta_1x^{(1)} + \\beta_2x^{(2)} + \\ldots + \\beta_px^{(p)}$$ \n", "\n", "where $x_i^{(0)} = 1 \\quad \\forall i$. This way we can write the hypothesis as a dot product between the input and the parameters:\n", "\n", "$$ h(x) = \\beta^Tx$$ \n", "\n", "How does the update rule for the gradient descent algorithm change? We use vector notation for the parameter $\\beta$, which now denotes a $(p+1)$-dimensional vector, like $x$ (once the fake $x^{(0)}$ is included). Since we added our fake $x_i^{(0)}$ we can use a single update rule for every parameter:\n", "\n", "$$\\beta_j = \\beta_j - \\alpha \\frac{\\partial}{\\partial \\beta_j} J(\\beta) = \\beta_j - \\alpha \\frac{1}{n} \\sum_{i=1}^n (h(x_i)-y_i) x_i^{(j)} \\quad \\forall j$$\n", "\n", "**Problems** with gradient descent:\n", "- you need to find the correct learning rate\n", "- you have to scale the data to use it effectively\n", "- you need many iterations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.6 Normal equation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Instead of gradient descent we can solve for the vector of parameters analytically. Let's see how we can do it. We have switched to vectorized notation. Now the input is no longer a vector of length $n$ but an $n \\times (p+1)$ matrix (including the column of ones for the intercept). We can rewrite our hypothesis in matrix notation as:\n", "\n", "$$Y = XB$$\n", "\n", "where $Y$ is an $n$-dimensional vector and $B$ is a $(p+1)$-dimensional vector. Can we solve directly for $B$? 
Yes, we can obtain it from the residual sum of squares that we saw at the beginning. \n", "\n", "$$RSS(\\beta) = \\sum_{i=1}^n (y_i - h(x_i))^2 = \\sum_{i=1}^n \\Big(y_i - \\sum_{j=0}^p x_{ij}\\beta_j\\Big)^2 $$\n", "\n", "How do we minimize this? If we switch to the matrix notation above we obtain the following equation:\n", "\n", "$$RSS(\\beta) = (y - X\\beta)^T(y - X\\beta)$$\n", "\n", "This is a quadratic function in the $p+1$ parameters. Differentiating with respect to $\\beta$ we obtain:\n", "\n", "$$\\frac{\\partial RSS(\\beta)}{\\partial \\beta} = -2X^T(y - X\\beta)$$\n", "\n", "Assuming (for the moment) that $X$ has full column rank, and hence $X^T X$ is positive definite, if we set this derivative to zero we get:\n", "\n", "$$X^T(y - X\\beta) = 0$$\n", "\n", "from which we obtain the unique solution: \n", "\n", "$$\\hat B = (X^T X)^{-1} X^T Y$$\n", "\n", "It might happen that the columns of $X$ are not linearly independent, so that $X$ is not of full rank. This would occur, for example, if two of the inputs were perfectly correlated (e.g., $x_2 = 3x_1$). Then $X^T X$ is singular (not invertible) and the least squares coefficients $\\beta$ are not uniquely defined. In that case it is possible to remove the redundant features or to use the pseudo-inverse.\n", "\n", "**Problems** with the Normal equation:\n", "- computing $(X^T X)^{-1}$ takes $O(p^3)$ time (plus $O(np^2)$ to form $X^T X$)\n", "- slow if the number of features $p$ is large" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3 Nearest Neighbors" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It is tempting to ask if there is an ideal $h(x)$. In particular, what is a good value of $h(x)$ at any selected value of $x$, say $x=v$? If the dataset has some noise, there can be many $y$ values for each $x$. An approximation for our target function can thus be:\n", "\n", "$$ h(v) = E(y \\mid x=v) $$\n", "\n", "So the correct value might be the expected value of the target at that particular point. 
The case of linear regression is a particular case where we might have a single value at each data point. This is an ideal form of *regression function*. $x$ can also be a vector. It is *ideal* because it is the function that minimizes $E[(y-g(x))^2 \\mid x=v]$ over all functions $g$ at all points $v$. Even so, we still make errors, called the *irreducible error* (remember the noise in the measurements?). \n", "\n", "Typically we don't have many observations at a given point (sometimes we don't even have a single value for each point), so we cannot compute $E(y \\mid x=v)$! What can we do? We can relax the definition and let\n", "\n", "$$ \\hat h(v) = \\mathrm{Ave}(y \\mid x \\in \\mathcal{N}(v)) $$\n", "\n", "where $\\mathcal{N}(v)$ is some neighborhood of $v$.\n", "\n", "This algorithm is called **Nearest Neighbors (NN)** (or local averaging). " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "(function(root) {\n", " function embed_document(root) {\n", " \n", " var docs_json = {\"094ab5a0-a94e-46fe-9dbd-dd4838d547ff\":{\"roots\":{\"references\":[{\"attributes\":{},\"id\":\"f665956b-5d3f-4753-b364-b7973ac9314b\",\"type\":\"LinearScale\"},{\"attributes\":{\"formatter\":{\"id\":\"3c94c329-e80f-48b8-b904-29379bce1cd8\",\"type\":\"BasicTickFormatter\"},\"minor_tick_out\":0,\"plot\":{\"id\":\"b21a6594-5111-498b-9803-eb938f70ce18\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"e5bfdf05-57af-438d-ac5c-21d701dc1264\",\"type\":\"BasicTicker\"}},\"id\":\"c05aa28f-f6a4-47f3-b84f-a09bb71cf79e\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"d9780aa9-e13f-4b52-82a8-94b27f53cce5\",\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"79535488-c9d4-474a-a5f5-5dbe155a7ef7\",\"type\":\"ColumnDataSource\"}},\"id\":\"89c1a739-7332-4144-a528-622634571ff3\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"3c94c329-e80f-48b8-b904-29379bce1cd8\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"formatter\":{\"id\":\"1316f6c4-56a9-407d-acbc-b2d65827b7be\",\"type\":\"BasicTickFormatter\"},\"minor_tick_out\":0,\"plot\":{\"id\":\"b21a6594-5111-498b-9803-eb938f70ce18\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"d9780aa9-e13f-4b52-82a8-94b27f53cce5\",\"type\":\"BasicTicker\"}},\"id\":\"cfd7186d-f71c-4cc9-a71e-d15e2965bc50\",\"type\":\"LinearAxis\"},{\"attributes\":{\"callback\":null,\"column_names\":[\"x\",\"y\"],\"data\":{\"x\":{\"__ndarray__\":\"MqNhARb47z/xq5BPfCXwPwoV11+YwH8/+OB1NJ4u6j+UnUAMN7TuvwI9nTkwj/i/mg4SYvTd8D8upzHE0Hz6P4Iha/CxjvS/IWCvl2sc9r/wh7HgDBjIPxhfXwUXf/G/9lsYNUWx4T+PiounKkL6P4XiHNE6/vy/WFkCwlYM4L81AycenmnnP9mV7eZVxsK/tVQpYCFa8L8wNELzK++wP6hvNYGW6Na/GTzlruiq978b9nYKuJ7tP61nUWXKweo/WrRpmaFT47/lqMgGVHf0P0KcFdSIBwBA4wlsELGI4D+vhc/9qVfSv09M47hI1QlAo5i2DiSj8r8SgTPnrPTwP5AKD3ZoEsQ/8MDx3O2W1z8zkiKLHunKPx9eMUTM85g/KLMYS8mt4D
/yeW3e4vTPP6t4atNROPW/2YUTzmAZ8D+qEaaB1nfuv5LiWYyX7c6/nZuf9OsK+D+04b42qdjVPwZG8ZaijNc/D7ELYtSt3D/gHC4CZEDov4k85SuoqvI/Huhf90Ksqr+3hGfOUqzfP5CTPjhXKem/u31ZcZR0z7/kkLxWjvzmv7KKfsNYNe6/gpJ4EQ5jy7/yTNz1JuXNv7H+7WuzVAPAy5bBgvjeyb+igHDcolzWP+TQDa2wp/I/0HqaRczU6r/bgPzJBcn7vxrL5y0lrfC/MDqqh/8Z8j8LFNUNlt74P18VC34N/No//YiOjl5Z+L/jW0APwgLwPyHRzDjse8m/urB9OU/zAsCCVwZfS9pvv6JuW36dtuQ/yLbpi8t//T9//7VBbpC9v4cCOncTKPY/U0kQqyuQwb8tUEqGLjzzv9LMEzwpN/c/FmZ1NG6c0b+YjH9ar3bsvxKIRq1GyNg/b6GxptwC1z+PT4uHrjbmvytM9iksStE/m+DoE+hT9L9mgjBdBvnyv78iGoBbqeG/lx0Mxuc/9j9t7ZVqclz3v1UYLCeFPvS/bSbjGLbQwD8rEI4ili/mv6irRA6xIOW/bdZ6NmUB+T81Jru8cAAAwG3pvW24qT2/TTH4IuJKAEBMtV/aW4zxP0RUTZjVYao/ckpVmKLO5j8=\",\"dtype\":\"float64\",\"shape\":[100]},\"y\":{\"__ndarray__\":\"HKyN58TfNkAqcCmmHGtLQKfLccn0vDHAnJ8AJl76UUCsp1WUWqljwKIHgeqGVlTA9tbCX+HeWECBjYHasbNiQMvjOqK1F2PAThyQJb67YcBZ4Wa5fTQwwL2avafZXlzAbqrnAhfFU0CcutvIKJ5lQFr5kqGx7WbA/ai+4iWOQcByzvurxLlTQIb6JFZ6ODzAq67P/ymQWcCuaCNKwAszQLyF4yx2oSzANppNmW8ZTMB4JDaGTshmQJD3pI1nKjpAiGLwGul6QsAsCuZOEmJJQLbz+3ePA2BALD6q3e1UIsBvuY2rU2w5QIxSsCAlTHFANOunMO9eQcBX3cFVYDFCQBCS7xah2v2/MrpXblTTQ8A7cj5YFnQ+wDrbJxVw3yJAf5IsVPcDQUBR9iclGOtSQGJbQcoLklbAKRi7qq96VEDGtPWytmhbwBDpl7XbMiZAj2cryny2ZEA8zFgguD4dQA4lMw3Sp0JAkXU7KBQKREB+MWh9I/c3wAFQEbuxCVZATYR9CJcUNMBJbCk9U+05QOiHOMpwYFfANjzQ/RnrRMAalzQSgA47wGahuGE0QUzA9QftGJJCSsAufSsCT9EzQF+C1gciIGPAKdtpRq9mRcB6l4qBNQE7QGANk/m4lVdA8JzBNdyXWcCHjH6vBBFfwGCdfP3y1VnAcOBJh7WLX0Damn+bvFdSQLj6ywMM2ynACZ5AoSPlXsA2E/oDqF9JQO622A9LPUjA/NDJp/X7acA5oAWyoS8yQDOe6hzkMlRAT+338HNiZ0DZMTKDvawxQJ4hHQ/VS1pApHlcEj2oN8AG+Iht9PlfwK3rJrF5p2FAPDlm1osBHkA3Yu/Y6WhWwCbbCjvnbDBAbed7Zn1fS0DQ+9q3LoNVwL/bJgGUIFRAE/3bQeVgZ8BgKfH7YGIwwIm5h0QbsTLA8CaETUYHXEBcs+iFvLZdwE4/qWmmAk/AbvyD5A1mOMCN9V8iiQZAwF5pxKBjK1nAWz8lSoX7YUASnd7VmkhlwN7w6vNNBDdAQZ2OKZUzaUAFU9iuNzFFQIGj7QzEVT3AmFqPMWWtV0A=\",\"dtype\":\"float64\",\"shape\":[100]}}},\"id\":\"19e428c3-7ed8-4cab-b464-f1ca0eea8ff6\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"02356f58-7e09-44d8-b07e-7c49b9d7aa07\",\
"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"24da6006-8d54-47e6-b88b-6dd9ae849505\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"6af4eb5c-a85d-477b-98aa-620cde806c25\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"overlay\":{\"id\":\"82fabb9b-587f-48d0-a6c7-4177c948b096\",\"type\":\"BoxAnnotation\"}},\"id\":\"4515d072-6e4d-4b04-babd-4b62aaf7dce8\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1316f6c4-56a9-407d-acbc-b2d65827b7be\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"line_color\":\"red\",\"line_width\":3,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"f1eca4dc-49bf-49c7-b0c8-276cb65f05ef\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"7a1ff284-0c57-4ba8-b3cf-03e09e6c6608\",\"type\":\"PanTool\"},{\"attributes\":{\"callback\":null},\"id\":\"f6fa9534-470f-4097-b827-bdff185e2dde\",\"type\":\"DataRange1d\"},{\"attributes\":{\"data_source\":{\"id\":\"79535488-c9d4-474a-a5f5-5dbe155a7ef7\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"f1eca4dc-49bf-49c7-b0c8-276cb65f05ef\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"66cc4c8e-b7de-40a3-ba89-39ffb2f4941b\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"89c1a739-7332-4144-a528-622634571ff3\",\"type\":\"CDSView\"}},\"id\":\"91c1038f-0374-4671-9f87-e498af46761a\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"plot\":null,\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"82fabb9b-587f-48d0-a6c7-4177c948b096\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"fill_color\":{\"value\":\"black\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"9fb12c21-a4d8-4b6e-878c-6f37f0664ac4\",\"type\":\"Circle\"},{\"attribute
s\":{\"data_source\":{\"id\":\"19e428c3-7ed8-4cab-b464-f1ca0eea8ff6\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"9fb12c21-a4d8-4b6e-878c-6f37f0664ac4\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"9e96a030-b230-47d3-ab7a-03910f131fb2\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"32a0c11b-6e3d-4ab3-8307-9a234825b29b\",\"type\":\"CDSView\"}},\"id\":\"276483c6-6058-49d7-b93c-ac138ee954de\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":3,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"66cc4c8e-b7de-40a3-ba89-39ffb2f4941b\",\"type\":\"Line\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"9e96a030-b230-47d3-ab7a-03910f131fb2\",\"type\":\"Circle\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"7a1ff284-0c57-4ba8-b3cf-03e09e6c6608\",\"type\":\"PanTool\"},{\"id\":\"6af4eb5c-a85d-477b-98aa-620cde806c25\",\"type\":\"WheelZoomTool\"},{\"id\":\"4515d072-6e4d-4b04-babd-4b62aaf7dce8\",\"type\":\"BoxZoomTool\"},{\"id\":\"24da6006-8d54-47e6-b88b-6dd9ae849505\",\"type\":\"SaveTool\"},{\"id\":\"02356f58-7e09-44d8-b07e-7c49b9d7aa07\",\"type\":\"ResetTool\"},{\"id\":\"69f52008-ca33-4523-9633-6cc229f07ae3\",\"type\":\"HelpTool\"}]},\"id\":\"56b7b8f2-6e1e-4dc7-8d2a-017e9c40121f\",\"type\":\"Toolbar\"},{\"attributes\":{\"callback\":null,\"column_names\":[\"x\",\"y\"],\"data\":{\"x\":{\"__ndarray__\":\"sf7ta7NUA8C6sH05T/MCwDUmu7xwAADAheIc0Tr+/L/bgPzJBcn7vwI9nTkwj/i//YiOjl5Z+L8ZPOWu6Kr3v23tlWpyXPe/IWCvl2sc9r+reGrTUTj1v4Iha/CxjvS/m+DoE+hT9L9VGCwnhT70vy1QSoYuPPO/ZoIwXQb58r+jmLYOJKPyvxhfXwUXf/G/GsvnLSWt8L+1VClgIVrwv5SdQAw3tO6/qhGmgdZ37r+yin7DWDXuv5iMf1qvduy/0HqaRczU6r+Qkz44Vynpv+AcLgJkQO
i/5JC8Vo785r+PT4uHrjbmvysQjiKWL+a/qKtEDrEg5b9atGmZoVPjv78iGoBbqeG/WFkCwlYM4L+obzWBlujWv6+Fz/2pV9K/FmZ1NG6c0b+7fVlxlHTPv5LiWYyX7c6/8kzc9Sblzb+CkngRDmPLv8uWwYL43sm/IdHMOOx7yb/Zle3mVcbCv1NJEKsrkMG/f/+1QW6Qvb8e6F/3Qqyqv4JXBl9L2m+/bem9bbipPb8KFddfmMB/Px9eMUTM85g/RFRNmNVhqj8wNELzK++wP20m4xi20MA/kAoPdmgSxD/wh7HgDBjIPzOSIose6co/8nlt3uL0zz8rTPYpLErRP7Thvjap2NU/ooBw3KJc1j9vobGm3ALXPwZG8ZaijNc/8MDx3O2W1z8SiEatRsjYP18VC34N/No/D7ELYtSt3D+3hGfOUqzfP+MJbBCxiOA/KLMYS8mt4D/2Wxg1RbHhP6JuW36dtuQ/ckpVmKLO5j81AycenmnnP/jgdTSeLuo/rWdRZcrB6j8b9nYKuJ7tPzKjYQEW+O8/41tAD8IC8D/ZhRPOYBnwP/GrkE98JfA/mg4SYvTd8D8SgTPnrPTwP0y1X9pbjPE/MDqqh/8Z8j/k0A2tsKfyP4k85SuoqvI/5ajIBlR39D+HAjp3Eyj2P5cdDMbnP/Y/0swTPCk39z+dm5/06wr4PwsU1Q2W3vg/bdZ6NmUB+T+PiounKkL6Py6nMcTQfPo/yLbpi8t//T9CnBXUiAcAQE0x+CLiSgBAT0zjuEjVCUA=\",\"dtype\":\"float64\",\"shape\":[100]},\"y\":{\"__ndarray__\":\"fE8c6yeWZ8DpfevOdwJnwN2R1HcMGmTAYQAi+yO9YMDLUmU14blgwCU1ONNGeVzApH1j4BhLXcBGBBZ0EThdwLf5YIQ8EFzAo3RlCp1/XcCu3hBUuIlXwFqWA6ppqVfAjBIMPzA2WsCbyOUfim9ZwP17+czNwVrA74jno38DVsDKDJih+tFVwOQel893a1fAgtJruRdqUcCNID76u5pRwP3kEfiz51LAYYfrRY93VMCBqdg+KfdSwCPuteUVrFHADaTEepVwS8D6dSWOkMVQwLfKo5GOmE7AvbdGUyDTSsCjJQ18VZBOwAPo53ud/k/A9DugwlOzRsDpUKkg/6RFwFSrQZXk60TAM9LS46YNQMCcEzuAAGM+wKpyGJDZGELAxz6AzmrzMMDnM+xJ9BY4wMGNY+gvNTTA0s887Fy6NMAcX91oVaEewK1oaoIZjyXAlLCPc08xMMBFkHLRockzwFa0L6vRwiDAIeF7ul8DJcCSYdiwsecTwNUEO5bU1f6/S8be6NvHI8AyIXcWvJwIQLUE4sfkyyNAG+UyBUWkKMBZIiUA07MAwEJEA/Rsgg5AnwvVxh3iMkDAw452GsY1QCrvxkhcazlAWf24bK1PN0BPCF8BejA9QFZe8aNIPEBANRTfgsLtP0C0SA0Mc5E+QERvDddUAztA4nj0+y44O0Dmui3Kgro5QNR32/diyEBA93/mRGkBSECE2thgXdE6QNv6o7dLI0RA0mHf5+nkR0DUGxl11vxFQGSghcjmnkpA9P/ji44dTEA5c2532xJQQDmOJG2lvVFAcWieEnC8UED9OcomaPtVQIk0LRNuz1RAAj6ytgZaUUB8KxUWhGlXQPLve10191NAxjUpPY08VUCdzm5E+GNRQBASd/qODFRAZUrqk7dqVkBijywpFORXQArLjUe7klNAJNYrJEYHWUDrKK1DPWRcQHbUTgmpAVpAY6O520wKYEBi/+HlMEVdQD7ZoIdoYF5Ab0E3zsAEXUBegYqM9uVfQA27TTzlZWBA4M+tbXrNYUBuAt9JlK9kQHBS5z7D0GNA6Rpz3Yvwb0A=\",\"dtype\":\"float64\",\"shape\":[100]}}},\"id\":\"79535488-c9d4-474a-a5f5-5dbe15
5a7ef7\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"below\":[{\"id\":\"cfd7186d-f71c-4cc9-a71e-d15e2965bc50\",\"type\":\"LinearAxis\"}],\"left\":[{\"id\":\"c05aa28f-f6a4-47f3-b84f-a09bb71cf79e\",\"type\":\"LinearAxis\"}],\"plot_height\":300,\"plot_width\":630,\"renderers\":[{\"id\":\"cfd7186d-f71c-4cc9-a71e-d15e2965bc50\",\"type\":\"LinearAxis\"},{\"id\":\"998bc982-12eb-48f4-a406-14e3f8a43d97\",\"type\":\"Grid\"},{\"id\":\"c05aa28f-f6a4-47f3-b84f-a09bb71cf79e\",\"type\":\"LinearAxis\"},{\"id\":\"584290fb-4248-4c4f-b4d4-04a2fbe4e5fa\",\"type\":\"Grid\"},{\"id\":\"82fabb9b-587f-48d0-a6c7-4177c948b096\",\"type\":\"BoxAnnotation\"},{\"id\":\"276483c6-6058-49d7-b93c-ac138ee954de\",\"type\":\"GlyphRenderer\"},{\"id\":\"91c1038f-0374-4671-9f87-e498af46761a\",\"type\":\"GlyphRenderer\"}],\"title\":null,\"toolbar\":{\"id\":\"56b7b8f2-6e1e-4dc7-8d2a-017e9c40121f\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"f6fa9534-470f-4097-b827-bdff185e2dde\",\"type\":\"DataRange1d\"},\"x_scale\":{\"id\":\"d71a340d-58ee-4100-980d-dceb1f51fad1\",\"type\":\"LinearScale\"},\"y_range\":{\"id\":\"71a685a4-a95f-4e07-bf79-b50a2ab791c7\",\"type\":\"DataRange1d\"},\"y_scale\":{\"id\":\"f665956b-5d3f-4753-b364-b7973ac9314b\",\"type\":\"LinearScale\"}},\"id\":\"b21a6594-5111-498b-9803-eb938f70ce18\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"callback\":null},\"id\":\"71a685a4-a95f-4e07-bf79-b50a2ab791c7\",\"type\":\"DataRange1d\"},{\"attributes\":{\"source\":{\"id\":\"19e428c3-7ed8-4cab-b464-f1ca0eea8ff6\",\"type\":\"ColumnDataSource\"}},\"id\":\"32a0c11b-6e3d-4ab3-8307-9a234825b29b\",\"type\":\"CDSView\"},{\"attributes\":{\"dimension\":1,\"grid_line_color\":{\"value\":null},\"plot\":{\"id\":\"b21a6594-5111-498b-9803-eb938f70ce18\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"e5bfdf05-57af-438d-ac5c-21d701dc1264\",\"type\":\"BasicTicker\"}},\"id\":\"584290fb-4248-4c4f-b4d4-04a2fbe4e5fa\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"e5bfdf05-57af-438d-ac5c-
21d701dc1264\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"d71a340d-58ee-4100-980d-dceb1f51fad1\",\"type\":\"LinearScale\"},{\"attributes\":{\"grid_line_color\":{\"value\":null},\"plot\":{\"id\":\"b21a6594-5111-498b-9803-eb938f70ce18\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"d9780aa9-e13f-4b52-82a8-94b27f53cce5\",\"type\":\"BasicTicker\"}},\"id\":\"998bc982-12eb-48f4-a406-14e3f8a43d97\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"69f52008-ca33-4523-9633-6cc229f07ae3\",\"type\":\"HelpTool\"}],\"root_ids\":[\"b21a6594-5111-498b-9803-eb938f70ce18\"]},\"title\":\"Bokeh Application\",\"version\":\"0.12.13\"}};\n", " var render_items = [{\"docid\":\"094ab5a0-a94e-46fe-9dbd-dd4838d547ff\",\"elementid\":\"5d5fd1a3-b842-4c5c-a2e6-afe0c53192c5\",\"modelid\":\"b21a6594-5111-498b-9803-eb938f70ce18\"}];\n", " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", "\n", " }\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " } else {\n", " var attempts = 0;\n", " var timer = setInterval(function(root) {\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " clearInterval(timer);\n", " }\n", " attempts++;\n", " if (attempts > 100) {\n", " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\")\n", " clearInterval(timer);\n", " }\n", " }, 10, root)\n", " }\n", "})(window);" ], "application/vnd.bokehjs_exec.v0+json": "" }, "metadata": { "application/vnd.bokehjs_exec.v0+json": { "id": "b21a6594-5111-498b-9803-eb938f70ce18" } }, "output_type": "display_data" } ], "source": [ "n_neighbors = 30\n", "sample = 1000\n", "\n", "knn = neighbors.KNeighborsRegressor(n_neighbors, weights='uniform')\n", "knn.fit(X, y)\n", "\n", "fig = bk.figure(plot_width=630, plot_height=300, title=None)\n", "fig.circle(X[::sample, 0], y[::sample], color='black')\n", "a = np.hstack([np.atleast_2d(X[::sample, 0]).T, \n", " np.atleast_2d(knn.predict(X[::sample])).T])\n", "a = 
a[a[:,0].argsort()]\n", "fig.line(a[:,0], a[:,1], color='red', line_width=3)\n", "fig.grid.grid_line_color = None\n", "fig.axis.minor_tick_out = 0\n", "bk.show(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Although it is a pretty cool idea, it does not work well in higher dimensions. This introduces a major topic in ML." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.1 The Curse of Dimensionality" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Nearest Neighbor averaging can be pretty good for small $p$ (typically $p \\leq 4$) and large $n$. But what happens when $p$ is large? This method can be pretty lousy. The reason is the curse of dimensionality. Nearest neighbors tend to be far away in high dimensions. \n", "\n", "We need to average a reasonable fraction of the $n$ values of $y_i$ to bring the variance down. A $10\\%$ neighborhood in high dimensions need no longer be local, so we lose the spirit of estimating $E(y \\mid x=v)$ by local averaging. \n", "\n", "For a better understanding take a look at the picture below." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So let's look at a little example of that. In the left panel, we've got values of two variables, $x_1$ and $x_2$, that are uniformly distributed in this square with edges from $-1$ to $+1$ along each axis. We form two $10\\%$ neighborhoods in this case. The first neighborhood involves just the variable $x_1$, ignoring $x_2$. \n", "\n", "That's indicated by the vertical dotted lines. Our target point is at $0$. And we spread out a neighborhood to the left and right until we capture $10\\%$ of the data points with respect to the variable $x_1$. 
The dotted lines indicate the width of the neighborhood.\n", "\n", "Alternatively, if we want to find a neighborhood in two dimensions, we spread out a circle centered at the target point, which is the red dot there, until we've captured $10\\%$ of the points. Notice the radius of the circle in two dimensions? It is much bigger than the radius of the neighborhood in one dimension, which is just half the width between the two dotted lines. To capture $10\\%$ of the points in two dimensions, we have to go out further and we are less local than we are in one dimension! \n", "\n", "We can take this example further. On the right-hand plot, we can see how far you have to go out in one, two, three, five, and ten dimensions. The curves are different versions of this problem as the number of dimensions gets higher: each one shows how far you have to go out in order to capture a certain fraction of the volume. Take, for example, $10\\%$ or $0.1$ fraction of the volume. For $p=1$, if the data is uniform, you roughly go out $10\\%$ of the distance. In two dimensions, we showed you have to go out further. Look what happens in five dimensions. In five dimensions, you have to go out to about $0.9$ on each coordinate axis to get $10\\%$ of the data. That's just about the whole radius of this sphere. And in ten dimensions, you actually have to break out of this sphere in order to get points in the corners to capture the $10\\%$. The take-home lesson here is: it's really hard to find near neighborhoods in high dimensions and stay local! If we didn't have this issue we could use NN for every problem!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4 Linear Models for classification" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In classification the output that we want to predict is a discrete variable that typically takes only a few values, which we call **classes**. 
Recall the examples that we saw before: \n", "- email: spam/not spam\n", "- transaction: fraudulent/not fraudulent\n", "- tumor: malignant/benign\n", "\n", "We can assign numerical values to those classes and our target variable becomes $y \\in \\{0,1\\}$ where $0$ is the *negative class* (not spam) and $1$ is the *positive class* (spam). \n", "\n", "Let's see if we can apply simple linear regression to one of these problems. For example, we want to learn whether a tumor is malignant or not based on its size. Suppose that we see the following example: our dataset is composed of various tumor measurements, each associated with the value $1$ (malignant) or $0$ (benign). Suppose that we find a hypothesis that is represented by the line in the picture. How can we decide to which class each example belongs?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can use a threshold at $0.5$, for example, such that: \n", "- if $h(x) \\geq 0.5$ predict $1$ \n", "- if $h(x) < 0.5$ predict $0$\n", "\n", "What happens if we change the dataset a little bit? Let's say we add one more example to the right." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What happened? The regression line changed because of the single new value, and using our threshold of $0.5$ now gives wrong results. How can we overcome this kind of situation? Is it possible to use a different algorithm that estimates the class correctly? Linear regression output can be $>1$ or $<0$, while in a two-class classification problem the class can only be $0$ or $1$. 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.1 Logistic Regression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The logistic regression model arises from the desire to model the posterior probabilities of the $K$ classes via linear functions in $x$, while at the same time ensuring that they sum to one and remain in $[0, 1]$.\n", "\n", "We apply the logistic function to our hypothesis. What is the logistic function? " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$$g(z) = \\frac{1}{1+e^{-z}} $$\n", "\n", "We can apply this function to our previous linear regression hypothesis:\n", "\n", "$$ h(x) = g(\\beta^Tx)$$\n", "\n", "The $z$ in the previous function now takes the value $\\beta^Tx$, and the output of the hypothesis is no longer the value of $\\hat y$ but the probability of belonging to the positive class, that is $P(Y=1 \\mid X=x)$, given $x$ and parametrized by $\\beta$." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.2 Decision Boundary" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Why is logistic regression a linear model if its function is clearly nonlinear? Let's discuss how the class is computed. \n", "\n", "Again we use a threshold: suppose we predict $y=1$ if $h(x) \\geq 0.5$ and $y=0$ if $h(x) < 0.5$. In the first case $g(z) \\geq 0.5$. But when is that the case? $g(z)$ is $\\geq 0.5$ when $z \\geq 0$, so basically when $\\beta^Tx \\geq 0$.\n", "\n", "What happens in the other case? We predict $y=0$ if $h(x) < 0.5$. In this case $g(z) < 0.5$, which happens when $z < 0$, so basically when $\\beta^Tx < 0$. The boundary between the two predictions, $\\beta^Tx = 0$, is linear in $x$: this is why logistic regression is a linear model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Suppose that we are looking at the decision boundary in the picture above. In this case the line that separates the two regions has equation $-3+x_1+x_2 = 0$. 
In this case we assign class $1$ to the examples where $x_1 + x_2 \\geq 3$ and class $0$ to the examples where $x_1 + x_2 < 3$." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.3 Cost function" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using mean squared error as the cost for logistic regression gives us a non-convex function in the parameter $\\beta$. What do we mean by non-convex? See the picture below! This happens because the $h(x)$ in the cost function is nonlinear (it contains the logistic function). As a result, it is really hard to find the global minimum of the error function and thus the optimal solution. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We need another cost function.\n", "\n", "$$J(h(x),y) = \\left\\{\\begin{array}{lr}\n", " -\\log(h(x)) & \\text{if } y=1\\\\\n", " -\\log(1-h(x)) & \\text{if } y=0\n", " \\end{array}\\right.$$\n", " \n", "Let's see if it works. If $y=1$ and $h(x)=1$ the cost is zero. So if the model is predicting the correct label the error is zero, as expected. If however $y=1$ and $h(x)=0$ the cost is $\\infty$ because we have $-\\log(0)$. This captures the notion that if we are making a mistake we are going to penalize the model by a large quantity. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The same applies for the case where $y=0$." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "(function(root) {\n", " function embed_document(root) {\n", " \n", " var docs_json = {\"9f0a6e9d-b850-4370-9a57-3c308b74d32a\":{\"roots\":{\"references\":[{\"attributes\":{},\"id\":\"04906994-e9d2-45c8-bd08-2c31df8857ca\",\"type\":\"HelpTool\"},{\"attributes\":{\"items\":[{\"id\":\"19f72b6d-984b-4bed-8e55-1e9e4611b8fe\",\"type\":\"LegendItem\"},{\"id\":\"0b47bfc4-806a-4bbb-9d50-b23fbf7adac0\",\"type\":\"LegendItem\"}],\"location\":\"bottom_right\",\"plot\":{\"id\":\"9a5a26cf-56fe-43b9-add6-727a968979a3\",\"subtype\":\"Figure\",\"type\":\"Plot\"}},\"id\":\"010cac3e-a766-41ef-a97c-b6ea31a8b845\",\"type\":\"Legend\"},{\"attributes\":{},\"id\":\"6b9a559a-8614-4acf-986d-b60ff77bfcee\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"callback\":null,\"column_names\":[\"x\",\"y\"],\"data\":{\"x\":{\"__ndarray__\":\"AAAAAAAAFMAC+pT2oMwTwAP0Ke1BmRPABe6+4+JlE8AH6FPagzITwAni6NAk/xLACtx9x8XLEsAM1hK+ZpgSwA7Qp7QHZRLAD8o8q6gxEsARxNGhSf4RwBO+ZpjqyhHAFLj7jouXEcAWspCFLGQRwBisJXzNMBHAGqa6cm79EMAboE9pD8oQwB2a5F+wlhDAH5R5VlFjEMAgjg5N8i8QwEQQR4cm+Q/ASARxdGiSD8BM+JphqisPwE7sxE7sxA7AUuDuOy5eDsBW1BgpcPcNwFnIQhaykA3AXLxsA/QpDcBgsJbwNcMMwGOkwN13XAzAZpjqyrn1C8BqjBS4+44LwG6APqU9KAvAcXRokn/BCsB0aJJ/wVoKwHhcvGwD9AnAe1DmWUWNCcB+RBBHhyYJwII4OjTJvwjAhixkIQtZCMCJII4OTfIHwIwUuPuOiwfAkAji6NAkB8CT/AvWEr4GwJfwNcNUVwbAmuRfsJbwBcCd2Imd2IkFwKHMs4oaIwXApMDdd1y8BMCotAdlnlUEwKuoMVLg7gPAr5xbPyKIA8CykIUsZCEDwLWErxmmugLAuXjZBuhTAsC8bAP0Ke0BwMBgLeFrhgHAw1RXzq0fAcDGSIG777gAwMo8q6gxUgDAmmGqK+fW/7+iSf4Fawn/v6gxUuDuO/6/sBmmunJu/b+2AfqU9qD8v7zpTW960/u/xNGhSf4F+7/KufUjgjj6v9KhSf4Fa/m/2Imd2Imd+L/gcfGyDdD3v+ZZRY2RAve/7EGZZxU19r/0Ke1BmWf1v/oRQRwdmvS/AvqU9qDM878I4ujQJP/yvw7KPKuoMfK/FrKQhSxk8b8cmuRfsJbwv0gEcXRoku+/WNQYKXD37b9gpMDdd1zsv3B0aJJ/weq/gEQQR4cm6b+IFLj7jovnv5jkX7CW8OW/qLQHZZ5V5L+4hK8Zprriv8BUV86tH+G/oEn+BWsJ37/A6U1vetPbv9CJndiJndi/8CntQZln1b8QyjyrqDHSv2DUGClw982/gBS4+46Lx7/AVFfOrR/BvwAq7UGZZ7W/gFRXzq0fob8AVVfOrR+RP8BU
V86tH7E/QNQYKXD3vT8AKu1BmWfFP8DpTW9608s/wFRXzq0f0T+wtAdlnlXUP5AUuPuOi9c/cHRokn/B2j9g1BgpcPfdPyCa5F+wluA/EMo8q6gx4j8A+pT2oMzjP/gp7UGZZ+U/6FlFjZEC5z/YiZ3YiZ3oP9C59SOCOOo/wOlNb3rT6z+wGaa6cm7tP6BJ/gVrCe8/zDyrqDFS8D/EVFfOrR/xP7xsA/Qp7fE/uISvGaa68j+wnFs/IojzP6i0B2WeVfQ/oMyzihoj9T+c5F+wlvD1P5T8C9YSvvY/jBS4+46L9z+ILGQhC1n4P4BEEEeHJvk/eFy8bAP0+T9wdGiSf8H6P2yMFLj7jvs/ZKTA3Xdc/D9cvGwD9Cn9P1jUGClw9/0/UOzETuzE/j9IBHF0aJL/PyCODk3yLwBAHprkX7CWAEAaprpybv0AQBaykIUsZAFAFL5mmOrKAUAQyjyrqDECQAzWEr5mmAJACOLo0CT/AkAG7r7j4mUDQAL6lPagzANA/gVrCV8zBED8EUEcHZoEQPgdFy/bAAVA9CntQZlnBUDyNcNUV84FQO5BmWcVNQZA6k1vetObBkDmWUWNkQIHQORlG6BPaQdA4HHxsg3QB0DcfcfFyzYIQNiJndiJnQhA1JVz60cECUDUoUn+BWsJQNCtHxHE0QlAzLn1I4I4CkDIxcs2QJ8KQMTRoUn+BQtAwN13XLxsC0C86U1vetMLQLz1I4I4OgxAuAH6lPagDEC0DdCntAcNQLAZprpybg1ArCV8zTDVDUCoMVLg7jsOQKQ9KPOsog5ApEn+BWsJD0CgVdQYKXAPQJxhqivn1g9AzDZAn9IeEEDKPKuoMVIQQMhCFrKQhRBAxkiBu++4EEDGTuzETuwQQMRUV86tHxFAwlrC1wxTEUDAYC3ha4YRQL5mmOrKuRFAvGwD9CntEUC6cm79iCASQLp42QboUxJAuH5EEEeHEkC2hK8ZproSQLSKGiMF7hJAspCFLGQhE0CwlvA1w1QTQK6cWz8iiBNArqLGSIG7E0CsqDFS4O4TQKqunFs/IhRAqLQHZZ5VFECmunJu/YgUQKTA3XdcvBRAosZIgbvvFECizLOKGiMVQKDSHpR5VhVAntiJndiJFUCc3vSmN70VQJrkX7CW8BVAmOrKufUjFkCW8DXDVFcWQJb2oMyzihZAlPwL1hK+FkCSAnffcfEWQJAI4ujQJBdAjg5N8i9YF0CMFLj7josXQIwaIwXuvhdAiiCODk3yF0CIJvkXrCUYQIYsZCELWRhAhDLPKmqMGECCODo0yb8YQIA+pT0o8xhAgEQQR4cmGUB+SntQ5lkZQHxQ5llFjRlAelZRY6TAGUB4XLxsA/QZQHZiJ3ZiJxpAdGiSf8FaGkB0bv2III4aQHJ0aJJ/wRpAcHrTm970GkBugD6lPSgbQGyGqa6cWxtAaowUuPuOG0Bokn/BWsIbQGiY6sq59RtAZp5V1BgpHEBkpMDdd1wcQGKqK+fWjxxAYLCW8DXDHEBetgH6lPYcQFy8bAP0KR1AXMLXDFNdHUBayEIWspAdQFjOrR8RxB1AVtQYKXD3HUBU2oMyzyoeQFLg7jsuXh5AUOZZRY2RHkBQ7MRO7MQeQE7yL1hL+B5ATPiaYaorH0BK/gVrCV8fQEgEcXRokh9ARgrcfcfFH0BEEEeHJvkfQCILWchCFiBAIY4OTfIvIEAgEcTRoUkgQB+UeVZRYyBAHhcv2wB9IEAdmuRfsJYgQBwdmuRfsCBAHKBPaQ/KIEAbIwXuvuMgQBqmunJu/SBAGSlw9x0XIUAYrCV8zTAhQBcv2wB9SiFAFrKQhSxkIUAWNUYK3H0hQBW4+46LlyFAFDuxEzuxIUATvmaY6sohQBJBHB2a5CFAEcTRoUn+IUAQR4cm+RciQBDKPKuoMSJAD03yL1hLIkAO0Ke0B2UiQA1TXTm3fiJADNYSvmaYIkALWchCFrIiQArcfcfFyyJACl8zTHXl
IkAJ4ujQJP8iQAhlnlXUGCNAB+hT2oMyI0AGawlfM0wjQAXuvuPiZSNABHF0aJJ/I0AE9CntQZkjQAN333HxsiNAAvqU9qDMI0ABfUp7UOYjQAAAAAAAACRA\",\"dtype\":\"float64\",\"shape\":[300]},\"y\":{\"__ndarray__\":\"9VHhE7K0qzxF1ScfJZezPDQBv8xGtLs8FWhcRNmWwzxSEjyH27PLPOewtmqNltM8dX9YQ3Cz2zyWqzaSQZbjPOFBFAEFs+s8iFPcuvWV8zy4U2/AmbL7PB2kp+SplQM9bq5pgS6yCz07mZgPXpUTPYdLA0TDsRs94y2vOxKVIz3EJDwIWLErPdZd62jGlDM9RTMUzuywOz0dJE2XepRDPTpwi5WBsEs9dHvUxi6UUz2I06FeFrBbPRBegffik2M9WlRXKauvaz1KxFMpl5NzPRnmq/U/r3s9A6RLXEuTgz2gdp/D1K6LPZfsaJD/kpM91Ocxk2mumz26gavFs5KjPVgDY2T+ras9MS8T/GeSsz1jYzI3k627Pa2QnzMcksM9CUKfCyityz164k9s0JHTPVAaqOG8rNs9JKEipoSR4z1p6Em5UazrPQ3LFOE4kfM9LKt+kuar+z2/YCAd7ZADPuxmOm17qws+p2g5WqGQEz5uK2VJEKsbPrP0R5hVkCM+nR/PJqWqKz5ULhzXCZAzPpeaGAU6qjs+IHBWFr6PQz78VoLjzqlLPmt5N1Vyj1M+s9uNwGOpWz6R2ECSJo9jPuxTPpn4qGs+RMZ1ytqOcz7ISppnjah7Pprm3PeOjoM+OjmvHyKoiz7v4oMOQ46TPt/RmKm2p5s+M82G9vaNoz629o/VSqerPrhFH4CqjbM+DmcJRN6muz4FjsNLXY3DPn619DVwpss+DVhmmg6N0z4esz0t/6XbPuck+u28jOM+4X/XLYil6z6MP4FKZYzzPpYC6D8Fpfs+MBdHuAGMAz/SF110aqQLP2ZloUiGixM/9KLh7p+jGz/T830g24ojP5PDlfx1ois/YQ9PkdCJMz/pkfxLjaA7P58ua1kHiEM/INhLhSedSz+a4VddwYRTP/hJhRnJlls/CnbH+IN+Yz+aB0lRf4prP/RHgg1gcnM/BW4UBnZyez+9zVKTj1qDP5HulpZIQ4s/PcQUxeQrkz8PYEQ7M+eaP90hbANs0aI/SWsX9oI2qj/BARdcYiayP4K/VT9Y77g/OCUE9ozywD+m2dvdfrjGP/2cjyM37s0/wCdWAHlL0z/gLYX23UHYP1JWSzxMpt0/oci5JPuW4T9slLQCFkPkP48azKBls+Y/NHkPlALQ6D9dd1vaRo/qPyhs8K6P8+s/P5Ibb+cG7T99fMQSwdbtP5hma/vjcO4/9TYliLHh7j/355F2czPvP4eUQ89Kbu8/iYXRLG2Y7z9l6gXzfLbvPwE0cHbhy+8/E1R4ghPb7z9YASIM2+XvP8vE2sx+7e8/VYg1+ufy7z/Xh/qUvPbvP/nslIxy+e8/DnvWjF377z8kxbLjuPzvP5CjeZKu/e8/TNMCVlz+7z/sQAA61/7vP3/UdCIu/+8/i2cRmGv/7z8/+0sOl//vPzxMPcq1/+8/CcMEhsv/7z+N8WPk2v/vP/RDh8Ll/+8/Sl7fce3/7z/9cgvh8v/vP7ARybj2/+8/zPlqcPn/7z/z5VFc+//vP+xyKLj8/+8/bswfrv3/7z/lyQ1c/v/vP6JdC9f+/+8/frIDLv//7z8tbYNr///vP1lBAJf//+8/bpbAtf//7z8KVX/L///vP0u739r//+8/ykS/5f//7z+5lW/t///vP4tv3/L//+8/6aa39v//7z9Znm/5///vP3vBW/v//+8/dsK3/P//7z/A1639///vP+DaW/7//+8/YufW/v//7z9C6i3////vP3Jxa////+8/ivOW////7z+at7X////vP/p4y////+8/PNva////7z8YvOX////vP
1Zt7f///+8/2N3y////7z+Itvb////vP9Ju+f///+8/Mlv7////7z9ct/z////vP5Ct/f///+8/qFv+////7z/E1v7////vP9At/////+8/YGv/////7z/mlv/////vP661/////+8/csv/////7z/W2v/////vP7jl/////+8/bO3/////7z/c8v/////vP7b2/////+8/bvn/////7z9a+//////vP7b8/////+8/rv3/////7z9c/v/////vP9b+/////+8/Lv//////7z9s///////vP5b//////+8/tv//////7z/M///////vP9r//////+8/5v//////7z/u///////vP/L//////+8/9v//////7z/6///////vP/z//////+8//P//////7z/+///////vP/7//////+8//v//////7z8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/\",\"dtype\":\"float64\",\"shape\":[300]}}},\"id\":\"336fba80-8878-408e-a06c-be35a8dfd394\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"line_color\":\"red\",\"line_width\":3,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"d0579044-02cb-463d-9923-cb516107ff12\",\"type\":\"Line\"},{\"attributes\":{\"source\":{\"id\":\"336fba80-8878-408e-a06c-be35a8dfd394\",\"type\":\"ColumnDataSource\"}},\"id\":\"da80ec0e-2776-4b95-8c44-39e3b377748f\",\"type\":\"CDSView\"},{\"attributes\":{\"callback\":null,\"end\":10,\"start\":-4},\"id\":\"a76608d9-67e8-4fd0-9f3d-66f8e03831f8\",\"type\":\"Range1d\"},{\"attri
butes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":3,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"76719c42-52c9-4c43-96de-7cb36dca7786\",\"type\":\"Line\"},{\"attributes\":{\"data_source\":{\"id\":\"336fba80-8878-408e-a06c-be35a8dfd394\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"d0579044-02cb-463d-9923-cb516107ff12\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"76719c42-52c9-4c43-96de-7cb36dca7786\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"da80ec0e-2776-4b95-8c44-39e3b377748f\",\"type\":\"CDSView\"}},\"id\":\"d4aa6622-04a4-4e55-9e5e-eef8d3de727f\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":3,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"10eab1f1-328b-4038-9d6a-1924f7d9220c\",\"type\":\"Line\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"f6a25900-dd02-4a32-a63a-7320ef257f30\",\"type\":\"PanTool\"},{\"id\":\"0f61e146-92b6-4d80-8b6a-6c0193cca2a4\",\"type\":\"WheelZoomTool\"},{\"id\":\"b592f1d6-ff6e-456d-8a1b-abc072d69893\",\"type\":\"BoxZoomTool\"},{\"id\":\"611e7801-7ded-4afe-85cb-8625da372170\",\"type\":\"SaveTool\"},{\"id\":\"5ed5ea00-0730-43b9-b232-26b9558ded86\",\"type\":\"ResetTool\"},{\"id\":\"04906994-e9d2-45c8-bd08-2c31df8857ca\",\"type\":\"HelpTool\"}]},\"id\":\"4a8a4238-eac5-4933-9a3c-12d28ec32661\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"06bc065d-53af-41fb-a429-0552f14fae3f\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"label\":{\"value\":\"Linear Regression 
Model\"},\"renderers\":[{\"id\":\"8a71e5f0-fe5b-4daa-b260-bdd79a9b69ce\",\"type\":\"GlyphRenderer\"}]},\"id\":\"0b47bfc4-806a-4bbb-9d50-b23fbf7adac0\",\"type\":\"LegendItem\"},{\"attributes\":{\"source\":{\"id\":\"be16d460-c01a-410a-903d-0e3fb2c1855e\",\"type\":\"ColumnDataSource\"}},\"id\":\"260a0f26-1eef-403d-8183-6969b5f7923c\",\"type\":\"CDSView\"},{\"attributes\":{\"callback\":null,\"column_names\":[\"x\",\"y\"],\"data\":{\"x\":{\"__ndarray__\":\"AAAAAAAAFMAC+pT2oMwTwAP0Ke1BmRPABe6+4+JlE8AH6FPagzITwAni6NAk/xLACtx9x8XLEsAM1hK+ZpgSwA7Qp7QHZRLAD8o8q6gxEsARxNGhSf4RwBO+ZpjqyhHAFLj7jouXEcAWspCFLGQRwBisJXzNMBHAGqa6cm79EMAboE9pD8oQwB2a5F+wlhDAH5R5VlFjEMAgjg5N8i8QwEQQR4cm+Q/ASARxdGiSD8BM+JphqisPwE7sxE7sxA7AUuDuOy5eDsBW1BgpcPcNwFnIQhaykA3AXLxsA/QpDcBgsJbwNcMMwGOkwN13XAzAZpjqyrn1C8BqjBS4+44LwG6APqU9KAvAcXRokn/BCsB0aJJ/wVoKwHhcvGwD9AnAe1DmWUWNCcB+RBBHhyYJwII4OjTJvwjAhixkIQtZCMCJII4OTfIHwIwUuPuOiwfAkAji6NAkB8CT/AvWEr4GwJfwNcNUVwbAmuRfsJbwBcCd2Imd2IkFwKHMs4oaIwXApMDdd1y8BMCotAdlnlUEwKuoMVLg7gPAr5xbPyKIA8CykIUsZCEDwLWErxmmugLAuXjZBuhTAsC8bAP0Ke0BwMBgLeFrhgHAw1RXzq0fAcDGSIG777gAwMo8q6gxUgDAmmGqK+fW/7+iSf4Fawn/v6gxUuDuO/6/sBmmunJu/b+2AfqU9qD8v7zpTW960/u/xNGhSf4F+7/KufUjgjj6v9KhSf4Fa/m/2Imd2Imd+L/gcfGyDdD3v+ZZRY2RAve/7EGZZxU19r/0Ke1BmWf1v/oRQRwdmvS/AvqU9qDM878I4ujQJP/yvw7KPKuoMfK/FrKQhSxk8b8cmuRfsJbwv0gEcXRoku+/WNQYKXD37b9gpMDdd1zsv3B0aJJ/weq/gEQQR4cm6b+IFLj7jovnv5jkX7CW8OW/qLQHZZ5V5L+4hK8Zprriv8BUV86tH+G/oEn+BWsJ37/A6U1vetPbv9CJndiJndi/8CntQZln1b8QyjyrqDHSv2DUGClw982/gBS4+46Lx7/AVFfOrR/BvwAq7UGZZ7W/gFRXzq0fob8AVVfOrR+RP8BUV86tH7E/QNQYKXD3vT8AKu1BmWfFP8DpTW9608s/wFRXzq0f0T+wtAdlnlXUP5AUuPuOi9c/cHRokn/B2j9g1BgpcPfdPyCa5F+wluA/EMo8q6gx4j8A+pT2oMzjP/gp7UGZZ+U/6FlFjZEC5z/YiZ3YiZ3oP9C59SOCOOo/wOlNb3rT6z+wGaa6cm7tP6BJ/gVrCe8/zDyrqDFS8D/EVFfOrR/xP7xsA/Qp7fE/uISvGaa68j+wnFs/IojzP6i0B2WeVfQ/oMyzihoj9T+c5F+wlvD1P5T8C9YSvvY/jBS4+46L9z+ILGQhC1n4P4BEEEeHJvk/eFy8bAP0+T9wdGiSf8H6P2yMFLj7jvs/ZKTA3Xdc/D9cvGwD9Cn9P1jUGClw9/0/UOzETuzE/j9IBHF0aJL/PyCODk3yLwBAHprkX7CWAEAaprpybv0AQBaykIUsZAFAFL5mmOrK
AUAQyjyrqDECQAzWEr5mmAJACOLo0CT/AkAG7r7j4mUDQAL6lPagzANA/gVrCV8zBED8EUEcHZoEQPgdFy/bAAVA9CntQZlnBUDyNcNUV84FQO5BmWcVNQZA6k1vetObBkDmWUWNkQIHQORlG6BPaQdA4HHxsg3QB0DcfcfFyzYIQNiJndiJnQhA1JVz60cECUDUoUn+BWsJQNCtHxHE0QlAzLn1I4I4CkDIxcs2QJ8KQMTRoUn+BQtAwN13XLxsC0C86U1vetMLQLz1I4I4OgxAuAH6lPagDEC0DdCntAcNQLAZprpybg1ArCV8zTDVDUCoMVLg7jsOQKQ9KPOsog5ApEn+BWsJD0CgVdQYKXAPQJxhqivn1g9AzDZAn9IeEEDKPKuoMVIQQMhCFrKQhRBAxkiBu++4EEDGTuzETuwQQMRUV86tHxFAwlrC1wxTEUDAYC3ha4YRQL5mmOrKuRFAvGwD9CntEUC6cm79iCASQLp42QboUxJAuH5EEEeHEkC2hK8ZproSQLSKGiMF7hJAspCFLGQhE0CwlvA1w1QTQK6cWz8iiBNArqLGSIG7E0CsqDFS4O4TQKqunFs/IhRAqLQHZZ5VFECmunJu/YgUQKTA3XdcvBRAosZIgbvvFECizLOKGiMVQKDSHpR5VhVAntiJndiJFUCc3vSmN70VQJrkX7CW8BVAmOrKufUjFkCW8DXDVFcWQJb2oMyzihZAlPwL1hK+FkCSAnffcfEWQJAI4ujQJBdAjg5N8i9YF0CMFLj7josXQIwaIwXuvhdAiiCODk3yF0CIJvkXrCUYQIYsZCELWRhAhDLPKmqMGECCODo0yb8YQIA+pT0o8xhAgEQQR4cmGUB+SntQ5lkZQHxQ5llFjRlAelZRY6TAGUB4XLxsA/QZQHZiJ3ZiJxpAdGiSf8FaGkB0bv2III4aQHJ0aJJ/wRpAcHrTm970GkBugD6lPSgbQGyGqa6cWxtAaowUuPuOG0Bokn/BWsIbQGiY6sq59RtAZp5V1BgpHEBkpMDdd1wcQGKqK+fWjxxAYLCW8DXDHEBetgH6lPYcQFy8bAP0KR1AXMLXDFNdHUBayEIWspAdQFjOrR8RxB1AVtQYKXD3HUBU2oMyzyoeQFLg7jsuXh5AUOZZRY2RHkBQ7MRO7MQeQE7yL1hL+B5ATPiaYaorH0BK/gVrCV8fQEgEcXRokh9ARgrcfcfFH0BEEEeHJvkfQCILWchCFiBAIY4OTfIvIEAgEcTRoUkgQB+UeVZRYyBAHhcv2wB9IEAdmuRfsJYgQBwdmuRfsCBAHKBPaQ/KIEAbIwXuvuMgQBqmunJu/SBAGSlw9x0XIUAYrCV8zTAhQBcv2wB9SiFAFrKQhSxkIUAWNUYK3H0hQBW4+46LlyFAFDuxEzuxIUATvmaY6sohQBJBHB2a5CFAEcTRoUn+IUAQR4cm+RciQBDKPKuoMSJAD03yL1hLIkAO0Ke0B2UiQA1TXTm3fiJADNYSvmaYIkALWchCFrIiQArcfcfFyyJACl8zTHXlIkAJ4ujQJP8iQAhlnlXUGCNAB+hT2oMyI0AGawlfM0wjQAXuvuPiZSNABHF0aJJ/I0AE9CntQZkjQAN333HxsiNAAvqU9qDMI0ABfUp7UOYjQAAAAAAAACRA\",\"dtype\":\"float64\",\"shape\":[300]},\"y\":{\"__ndarray__\":\"KszCjsuQ0b+0eeIxuSbRvzonAtWmvNC/xNQheJRS0L+cBIM2BNHPv7Bfwnzf/M6/wLoBw7oozr/UFUEJllTNv+RwgE9xgMy/9Mu/lUysy78IJ//bJ9jKvxyCPiIDBMq/LN19aN4vyb9AOL2uuVvIv1CT/PSUh8e/ZO47O3Czxr90SXuBS9/Fv4ikuscmC8W/nP/5DQI3xL+sWjlU3WLDv7y1eJq4jsK/0BC44JO6wb/ka/cmb+bAv/TGNm1KEsC/DETsZkt8vr80+mrzAdS8v
1iw6X+4K7u/eGZoDG+Dub+gHOeYJdu3v8TSZSXcMra/5IjksZKKtL8MP2M+SeKyvzT14cr/ObG/qFbBrmwjr7/wwr7H2dKrv0AvvOBGgqi/gJu5+bMxpb/IB7cSIeGhvzDoaFccIZ2/wMBjifZ/lr+gMr12ob2Pv8DjstpVe4K/gFOi+ijkZL8AdMO6ghJwP8AIbPmMS4U/4Cu7SuxGkT9QU8AYEuiXP7B6xeY3iZ4/GFFl2i6Voj/I5GfBweWlP4B4aqhUNqk/OAxtj+eGrD/wn292etevP9QZua4GlLE/sGM6IlA8sz+MrbuVmeS0P2T3PAnjjLY/REG+fCw1uD8giz/wdd25P/jUwGO/hbs/2B5C1wguvT+waMNKUta+P0dZIt9NP8A/M/7imHITwT8io6NSl+fBPxFIZAy8u8I//ewkxuCPwz/skeV/BWTEP9k2pjkqOMU/x9tm804Mxj+0gCetc+DGP6Ml6GaYtMc/kcqoIL2IyD9+b2na4VzJP20UKpQGMco/WbnqTSsFyz9IXqsHUNnLPzcDbMF0rcw/I6gse5mBzT8STe00vlXOP//xre7iKc8/7JZuqAf+zz/unRcxFmnQP2Tw940o09A/20LY6jo90T9TlbhHTafRP8rnmKRfEdI/QDp5AXJ70j+2jFlehOXSPy7fObuWT9M/pTEaGKm50z8bhPp0uyPUP5TW2tHNjdQ/Cim7LuD31D+Ae5uL8mHVP/bNe+gEzNU/byBcRRc21j/lcjyiKaDWP1vFHP87Ctc/1Bf9W0501z9Kat24YN7XP8C8vRVzSNg/Nw+ecoWy2D+vYX7PlxzZPyW0Xiyqhtk/nAY/ibzw2T8UWR/mzlraP4qr/0LhxNo/Af7fn/Mu2z95UMD8BZnbP/CioFkYA9w/ZvWAtipt3D/cR2ETPdfcP1SaQXBPQd0/y+whzWGr3T9BPwIqdBXeP7qR4oaGf94/MOTC45jp3j+mNqNAq1PfPxyJg529vd8/yu0x/ecT4D8GF6Ir8UjgP0FAElr6feA/fWmCiAOz4D+4kvK2DOjgP/O7YuUVHeE/LuXSEx9S4T9qDkNCKIfhP6Y3s3AxvOE/4WAjnzrx4T8dipPNQybiP1izA/xMW+I/lNxzKlaQ4j/OBeRYX8XiPwsvVIdo+uI/RljEtXEv4z+BgTTkemTjP72qpBKEmeM/+NMUQY3O4z80/YRvlgPkP28m9Z2fOOQ/q09lzKht5D/meNX6saLkPyKiRSm71+Q/Xsu1V8QM5T+Y9CWGzUHlP9QdlrTWduU/D0cG49+r5T9LcHYR6eDlP4aZ5j/yFeY/wsJWbvtK5j/+68acBIDmPzkVN8sNteY/dD6n+Rbq5j+wZxcoIB/nP+yQh1YpVOc/Jrr3hDKJ5z9i42ezO77nP54M2OFE8+c/2TVIEE4o6D8UX7g+V13oP1CIKG1gkug/irGYm2nH6D/I2gjKcvzoPwMEefh7Mek/Pi3pJoVm6T95VllVjpvpP7R/yYOX0Ok/8Kg5sqAF6j8q0qngqTrqP2j7GQ+zb+o/oySKPbyk6j/eTfprxdnqPxp3aprODus/VKDayNdD6z+QyUr34HjrP8vyuiXqres/CBwrVPPi6z9DRZuC/BfsP35uC7EFTew/upd73w6C7D/1wOsNGLfsPzDqWzwh7Ow/axPMaioh7T+oPDyZM1btP+NlrMc8i+0/H48c9kXA7T9auIwkT/XtP5Xh/FJYKu4/0AptgWFf7j8LNN2vapTuP0hdTd5zye4/hIa9DH3+7j+/ry07hjPvP/rYnWmPaO8/NQIOmJid7z9wK37GodLvP1Yqd3rVA/A/9D6vEVoe8D+SU+eo3jjwPzBoH0BjU/A/zXxX1+dt8D9qkY9ubIjwPwimxwXxovA/prr/nHW98D9Ezzc0+tfwP+Ljb8t+8vA/gPinYgMN8T8dDeD5hyfxP7ohGJEMQvE/WDZQKJFc8T/2Soi/FXfxP5RfwFaakfE/MnT47
R6s8T/QiDCFo8bxP22daBwo4fE/C7Kgs6z78T+oxthKMRbyP0fbEOK1MPI/5O9IeTpL8j+CBIEQv2XyPyAZuadDgPI/vi3xPsia8j9bQinWTLXyP/hWYW3Rz/I/l2uZBFbq8j80gNGb2gTzP9KUCTNfH/M/cKlByuM58z8OvnlhaFTzP6vSsfjsbvM/SOfpj3GJ8z/n+yEn9qPzP4QQWr56vvM/IiWSVf/Y8z/AOcrsg/PzP15OAoQIDvQ/+2I6G40o9D+Yd3KyEUP0PzeMqkmWXfQ/1aDi4Bp49D9ytRp4n5L0PxDKUg8krfQ/rt6KpqjH9D9L88I9LeL0P+gH+9Sx/PQ/iBwzbDYX9T8lMWsDuzH1P8JFo5o/TPU/YFrbMcRm9T/+bhPJSIH1P5uDS2DNm/U/OZiD91G29T/YrLuO1tD1P3XB8yVb6/U/EtYrvd8F9j+w6mNUZCD2P07/m+voOvY/7BPUgm1V9j+IKAwa8m/2Pyg9RLF2ivY/xlF8SPuk9j9iZrTff7/2PwB77HYE2vY/no8kDon09j88pFylDQ/3P9i4lDySKfc/eM3M0xZE9z8W4gRrm173P7L2PAIgefc/UAt1maST9z/uH60wKa73P4w05cetyPc/KkkdXzLj9z/IXVX2tv33P2ZyjY07GPg/AofFJMAy+D+gm/27RE34Pz6wNVPJZ/g/3MRt6k2C+D962aWB0pz4Pxju3RhXt/g/tgIWsNvR+D9SF05HYOz4P/Arht7kBvk/jkC+dWkh+T8sVfYM7jv5P8ppLqRyVvk/aH5mO/dw+T8Gk57Se4v5P6Sn1mkApvk/QLwOAYXA+T/e0EaYCdv5P3zlfi+O9fk/Gvq2xhIQ+j+4Du9dlyr6P1YjJ/UbRfo/9DdfjKBf+j+QTJcjJXr6Py5hz7qplPo/\",\"dtype\":\"float64\",\"shape\":[300]}}},\"id\":\"41795a44-b44b-46f6-9b05-5889544c4b75\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":3,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"d8816153-d1a5-4972-b632-e9500c39bc80\",\"type\":\"Line\"},{\"attributes\":{\"callback\":null,\"column_names\":[\"x\",\"y\"],\"data\":{\"x\":{\"__ndarray__\":\"C5Ea8w98HkC4KVqjDCTzP5RBJk88RQxA0DmqpD+CIkDSRvpOInkcQBocLhZvOtm/1bM9L/9oDUC8+bD0uQnYvwLmfcmBUN4/UqysT0CxAEASry3H3S7yP7W7UIwKWxhASKEks0hJBkD4iyV9nPTwP5nhAcPuHvs/Z1x0hfo0+T9aylhivgoZQMtoX/Y3G9C/V+++x+H79j/zBXNRVnriv4mQ0k4/hQPAH1mNwPhGAkBor33psmAMQF32GwMuCNa/Q1W+l8W9IUCRHN/l8/z3vyxBjia136o/qrwcTq6G1z9lGZToylQZQIc6tnGq/xdAfOXvzlfi2D8orlWyccr6PxO7kor9cPG/dzcQnDqK/7/KFBpswDzhv56ghT6lgeo/p21NOmZgFEACxSJf9fwSQHe3aqiCL9G/PeoeqMUq5L/PL+SMVO/3v+jC/Ax4nPS/Yqj1BROA+r802kUqdfkfQJjN+NP7Rso/7mQQIzXOw7/gktkwIG34vxqk5+x+jwtAqkYxncERAMCimypBUXrWv3JZPWVfT+2/D6gY2y5+AEB/QTg8037nvxTogahl2/a/2ManOTaNrb+BdQnbgTr4P8JP4O90VOM/xOg8BlxZ7D/UeIKzYlDvv9I2PNUfn9+/7ByvssJM6r/ABCrhO
xLMP+BPr23/5+C/GwVSREUz+7+0La77ieTVPzJMJcUhAcO/Ti2AZ1vi/r8Mgwd/AjT2P1Ba6dzioOG/InaXOSFi0z9Mr8lMfIoJQJsBcC3RkeM/ag4GHTRCE0DQzcSp0+H2vxevA30WyfQ/VuDAUcG93r8j+7DRA8rxvzMX2fCSJOm/W9F7koKv3L8yBIzmh23NPwVj6gNxV/S/dKr97cyGCUDIcZH9iLb6PyYVFTVJoAHAO1kCvPOPGED2s7HyaGkcQBQRT7oDiRFAMv1cMvkGxb8jgEXnk670v6OBI2EMuRJA1HWukyZQ6b9qcwtMLOETQBMq+Yg2SOo/9bKaSfNyDEDjcQKctVH5P48Pjm3ZMgZANHbdX6KB0T9/vt7w548dQG6TcxaDgfI/IfaZd0ISAEA=\",\"dtype\":\"float64\",\"shape\":[100]},\"y\":{\"__ndarray__\":\"AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAAAAAAAAAAAA8D8AAAAAAAAAAAAAAAAAAAAAAAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAAAAAAAAAAAA8D8AAAAAAAAAAAAAAAAAAAAAAAAAAAAA8D8AAAAAAADwPwAAAAAAAAAAAAAAAAAA8D8AAAAAAAAAAAAAAAAAAPA/AAAAAAAAAAAAAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADwPwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPA/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA8D8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADwPwAAAAAAAAAAAAAAAAAAAAAAAAAAAADwPwAAAAAAAAAAAAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAAAAAAAAAAAAAPA/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADwPwAAAAAAAAAAAAAAAAAA8D8AAAAAAADwPwAAAAAAAAAAAAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAPA/AAAAAAAAAAAAAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D8=\",\"dtype\":\"float64\",\"shape\":[100]}}},\"id\":\"be16d460-c01a-410a-903d-0e3fb2c1855e\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"below\":[{\"id\":\"98c0353c-dd6e-4e4a-ba97-f00210a483fe\",\"type\":\"LinearAxis\"}],\"left\":[{\"id\":\"53a20604-6775-41be-a532-8161544a12f5\",\"type\":\"LinearAxis\"}],\"plot_height\":300,\"plot_width\":630,\"renderers\":[{\"id\":\"98c0353c-dd6e-4e4a-ba97-f00210a483fe\",\"type\":\"LinearAxis\"},{\"id\":\"c2db0
362-7d07-4745-92b5-7f15ddf99c25\",\"type\":\"Grid\"},{\"id\":\"53a20604-6775-41be-a532-8161544a12f5\",\"type\":\"LinearAxis\"},{\"id\":\"84c8811b-772a-457c-ab0f-8f35f47714de\",\"type\":\"Grid\"},{\"id\":\"4b0f83ac-7f27-421f-b48e-c2890c0721a1\",\"type\":\"BoxAnnotation\"},{\"id\":\"bc64fadb-5d78-4c40-9552-190aa8c917e6\",\"type\":\"GlyphRenderer\"},{\"id\":\"010cac3e-a766-41ef-a97c-b6ea31a8b845\",\"type\":\"Legend\"},{\"id\":\"d4aa6622-04a4-4e55-9e5e-eef8d3de727f\",\"type\":\"GlyphRenderer\"},{\"id\":\"8a71e5f0-fe5b-4daa-b260-bdd79a9b69ce\",\"type\":\"GlyphRenderer\"}],\"title\":null,\"toolbar\":{\"id\":\"4a8a4238-eac5-4933-9a3c-12d28ec32661\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"a76608d9-67e8-4fd0-9f3d-66f8e03831f8\",\"type\":\"Range1d\"},\"x_scale\":{\"id\":\"34cc3348-e357-40ac-8f76-94af471b0594\",\"type\":\"LinearScale\"},\"y_range\":{\"id\":\"6e55b303-1ae0-4e7c-97ad-da458be37afb\",\"type\":\"Range1d\"},\"y_scale\":{\"id\":\"a5d29820-8616-42a3-9c0c-c1e52be01db9\",\"type\":\"LinearScale\"}},\"id\":\"9a5a26cf-56fe-43b9-add6-727a968979a3\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"ticks\":[0,0.5,1]},\"id\":\"5c995ec2-0619-43cc-9624-27c2be5852e0\",\"type\":\"FixedTicker\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"c9619fef-02f2-491e-8d8f-5b7c274410b6\",\"type\":\"Circle\"},{\"attributes\":{\"data_source\":{\"id\":\"41795a44-b44b-46f6-9b05-5889544c4b75\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"10eab1f1-328b-4038-9d6a-1924f7d9220c\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"d8816153-d1a5-4972-b632-e9500c39bc80\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"937c8615-bb0f-4e23-a701-c6af39e7d182\",\"type\":\"CDSView\"}},\"id\":\"8a71e5f0-fe5b-4daa-b260-bdd79a9b69ce\",\"type\":\"GlyphRenderer\"},{\"a
ttributes\":{},\"id\":\"34cc3348-e357-40ac-8f76-94af471b0594\",\"type\":\"LinearScale\"},{\"attributes\":{\"source\":{\"id\":\"41795a44-b44b-46f6-9b05-5889544c4b75\",\"type\":\"ColumnDataSource\"}},\"id\":\"937c8615-bb0f-4e23-a701-c6af39e7d182\",\"type\":\"CDSView\"},{\"attributes\":{\"callback\":null,\"end\":1.25,\"start\":-0.25},\"id\":\"6e55b303-1ae0-4e7c-97ad-da458be37afb\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"a5d29820-8616-42a3-9c0c-c1e52be01db9\",\"type\":\"LinearScale\"},{\"attributes\":{\"grid_line_color\":{\"value\":null},\"plot\":{\"id\":\"9a5a26cf-56fe-43b9-add6-727a968979a3\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"0e57ef1d-3ab6-4555-bfdb-273218a7f88a\",\"type\":\"BasicTicker\"}},\"id\":\"c2db0362-7d07-4745-92b5-7f15ddf99c25\",\"type\":\"Grid\"},{\"attributes\":{\"formatter\":{\"id\":\"06bc065d-53af-41fb-a429-0552f14fae3f\",\"type\":\"BasicTickFormatter\"},\"minor_tick_out\":0,\"plot\":{\"id\":\"9a5a26cf-56fe-43b9-add6-727a968979a3\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"0e57ef1d-3ab6-4555-bfdb-273218a7f88a\",\"type\":\"BasicTicker\"}},\"id\":\"98c0353c-dd6e-4e4a-ba97-f00210a483fe\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"0e57ef1d-3ab6-4555-bfdb-273218a7f88a\",\"type\":\"BasicTicker\"},{\"attributes\":{\"formatter\":{\"id\":\"6b9a559a-8614-4acf-986d-b60ff77bfcee\",\"type\":\"BasicTickFormatter\"},\"minor_tick_out\":0,\"plot\":{\"id\":\"9a5a26cf-56fe-43b9-add6-727a968979a3\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"5c995ec2-0619-43cc-9624-27c2be5852e0\",\"type\":\"FixedTicker\"}},\"id\":\"53a20604-6775-41be-a532-8161544a12f5\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"40aa7a62-c6d7-469b-a50e-943f276f7ac5\",\"type\":\"BasicTicker\"},{\"attributes\":{\"dimension\":1,\"grid_line_color\":{\"value\":null},\"plot\":{\"id\":\"9a5a26cf-56fe-43b9-add6-727a968979a3\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"40aa7a62-c6d7-469b-a50e-943f276
f7ac5\",\"type\":\"BasicTicker\"}},\"id\":\"84c8811b-772a-457c-ab0f-8f35f47714de\",\"type\":\"Grid\"},{\"attributes\":{\"data_source\":{\"id\":\"be16d460-c01a-410a-903d-0e3fb2c1855e\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"73991623-91e1-4105-82a6-37a8132ab4ac\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"c9619fef-02f2-491e-8d8f-5b7c274410b6\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"260a0f26-1eef-403d-8183-6969b5f7923c\",\"type\":\"CDSView\"}},\"id\":\"bc64fadb-5d78-4c40-9552-190aa8c917e6\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"fill_color\":{\"value\":\"black\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"73991623-91e1-4105-82a6-37a8132ab4ac\",\"type\":\"Circle\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"plot\":null,\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"4b0f83ac-7f27-421f-b48e-c2890c0721a1\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"f6a25900-dd02-4a32-a63a-7320ef257f30\",\"type\":\"PanTool\"},{\"attributes\":{\"label\":{\"value\":\"Logistic Regression 
Model\"},\"renderers\":[{\"id\":\"d4aa6622-04a4-4e55-9e5e-eef8d3de727f\",\"type\":\"GlyphRenderer\"}]},\"id\":\"19f72b6d-984b-4bed-8e55-1e9e4611b8fe\",\"type\":\"LegendItem\"},{\"attributes\":{},\"id\":\"0f61e146-92b6-4d80-8b6a-6c0193cca2a4\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"overlay\":{\"id\":\"4b0f83ac-7f27-421f-b48e-c2890c0721a1\",\"type\":\"BoxAnnotation\"}},\"id\":\"b592f1d6-ff6e-456d-8a1b-abc072d69893\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"611e7801-7ded-4afe-85cb-8625da372170\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"5ed5ea00-0730-43b9-b232-26b9558ded86\",\"type\":\"ResetTool\"}],\"root_ids\":[\"9a5a26cf-56fe-43b9-add6-727a968979a3\"]},\"title\":\"Bokeh Application\",\"version\":\"0.12.13\"}};\n", " var render_items = [{\"docid\":\"9f0a6e9d-b850-4370-9a57-3c308b74d32a\",\"elementid\":\"4a0b1690-7c06-453c-80c6-97a41fb54e2e\",\"modelid\":\"9a5a26cf-56fe-43b9-add6-727a968979a3\"}];\n", " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", "\n", " }\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " } else {\n", " var attempts = 0;\n", " var timer = setInterval(function(root) {\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " clearInterval(timer);\n", " }\n", " attempts++;\n", " if (attempts > 100) {\n", " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\")\n", " clearInterval(timer);\n", " }\n", " }, 10, root)\n", " }\n", "})(window);" ], "application/vnd.bokehjs_exec.v0+json": "" }, "metadata": { "application/vnd.bokehjs_exec.v0+json": { "id": "9a5a26cf-56fe-43b9-add6-727a968979a3" } }, "output_type": "display_data" } ], "source": [ "# this is our test set, it's just a straight line with some\n", "# Gaussian noise\n", "xmin, xmax = -5, 5\n", "n_samples = 100\n", "np.random.seed(0)\n", "X = np.random.normal(size=n_samples)\n", "y = (X > 0).astype(np.float)\n", "X[X > 0] *= 4\n", "X += .3 * 
np.random.normal(size=n_samples)\n", "\n", "X = X[:, np.newaxis]\n", "# run the classifier\n", "clf = linear_model.LogisticRegression(C=1e5)\n", "clf.fit(X, y)\n", "\n", "# and plot the result\n", "fig = bk.figure(plot_width=630, plot_height=300, title=None)\n", "fig.circle(X.ravel(), y, color='black')\n", "X_test = np.linspace(-5, 10, 300)\n", "\n", "def model(x):\n", " return 1 / (1 + np.exp(-x))\n", "\n", "loss = model(X_test * clf.coef_ + clf.intercept_).ravel()\n", "fig.line(X_test, loss, color='red', line_width=3, \n", " legend='Logistic Regression Model')\n", "\n", "ols = linear_model.LinearRegression()\n", "ols.fit(X, y)\n", "\n", "fig.line(X_test, ols.coef_ * X_test + ols.intercept_, line_width=3, \n", " legend='Linear Regression Model')\n", "\n", "fig.x_range=Range1d(-4, 10)\n", "fig.y_range=Range1d(-0.25, 1.25)\n", "fig.grid.grid_line_color = None\n", "fig.axis.minor_tick_out = 0\n", "fig.yaxis[0].ticker=FixedTicker(ticks=[0, 0.5, 1])\n", "fig.legend.location = \"bottom_right\"\n", "bk.show(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.4 Multiclass Classification" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So far we have assumed that logistic regression is used to classify problems with an outcome consisting of two classes. It is possible to extend it to a multiclass setting, that is, a problem where the discrete output variable can assume more than two values. \n", "\n", "One simple extension to the basic binary classification is to use a one-vs-all approach, that is, to turn the problem into three separate problems, each of which is binary."
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Train a logistic regression classifier for each class:\n", "\n", "$$h_i(x) = P(y=i \\mid x, \\beta) \\quad (i=1,2,3)$$\n", "\n", "Then, on a new input $x$, output the class $i$ that maximizes the probability:\n", "\n", "$$\\underset{i}{\\arg\\max} \\, h_i(x)$$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "\n", "Visit [www.add-for.com](http://www.add-for.com) for more tutorials and updates.\n", "\n", "This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License." ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda env:addfor_tutorials]", "language": "python", "name": "conda-env-addfor_tutorials-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 1 }