{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Lineare Regression\n", "\n", "This notebook implements linear regression using gradient descent as taught in Week 1 of [Coursera's Machine Learning course](https://www.coursera.org/learn/machine-learning/#syllabus).\n", "\n", "The course doesn't explicitly go into the implementation yet, so I'm not sure how to pick a good alpha (learning rate) nor do I know a good way to determine the number of iterations the algorithm should run.\n", "\n", "I decided to attempt a quick implementation in Python anyway in order to improve my understanding." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
xy
001
113
225
337
449
5511
\n", "
" ], "text/plain": [ " x y\n", "0 0 1\n", "1 1 3\n", "2 2 5\n", "3 3 7\n", "4 4 9\n", "5 5 11" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [1, 3, 5, 7, 9, 11]})\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def hypothesis(theta0, theta1, x):\n", " return theta0 + theta1 * x\n", "\n", "def linear_regression(df, alpha, iterations):\n", " theta0 = 0\n", " theta1 = 0\n", " m = len(df)\n", " for _ in range(0, iterations):\n", " newTheta0 = 0\n", " newTheta1 = 0\n", " for i in range (0, m):\n", " error = hypothesis(theta0, theta1, df.x[i]) - df.y[i]\n", " newTheta0 = newTheta0 + error\n", " newTheta1 = newTheta1 + error * df.x[i]\n", " newTheta0 = theta0 - alpha / m * newTheta0\n", " newTheta1 = theta1 - alpha / m * newTheta1\n", " theta0 = newTheta0\n", " theta1 = newTheta1\n", " return [theta0, theta1]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 212 ms, sys: 3.03 ms, total: 215 ms\n", "Wall time: 213 ms\n" ] }, { "data": { "text/plain": [ "[0.99999999999996192, 2.0000000000000107]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "linear_regression(df, 0.1, 1000)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The implementation above can be made more efficient by replacing the inner loop with matrix operations instead.\n", "\n", "Below is my attempt at doing so:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def vector_linear_regression(df, alpha, iterations):\n", " m = len(df)\n", " thetas = np.zeros((2,1))\n", " X = pd.DataFrame(data={0: 1, 1: df.x}).as_matrix() # Is there a more elegant way to build this matrix?\n", " y = np.array([df.y]).transpose()\n", " op = X.transpose()\n", " for _ in range(0, iterations):\n", " errors = X.dot(thetas) - y\n", " sums = op.dot(errors)\n", " thetas = thetas - sums * alpha / m\n", " return [thetas[0][0], thetas[1][0]]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 66.5 ms, sys: 3.47 ms, total: 69.9 ms\n", "Wall time: 69.2 ms\n" ] }, { "data": { "text/plain": [ "[0.99999999999996181, 2.0000000000000107]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "vector_linear_regression(df, 0.1, 1000)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1],\n", " [ 1, 3],\n", " [ 2, 5],\n", " [ 3, 7],\n", " [ 4, 9],\n", " [ 5, 11]])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.as_matrix()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }