{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Lineare Regression\n",
    "\n",
    "This notebook implements linear regression using gradient descent as taught in Week 1 of [Coursera's Machine Learning course](https://www.coursera.org/learn/machine-learning/#syllabus).\n",
    "\n",
    "The course doesn't explicitly go into the implementation yet, so I'm not sure how to pick a good alpha (learning rate) nor do I know a good way to determine the number of iterations the algorithm should run.\n",
    "\n",
    "I decided to attempt a quick implementation in Python anyway in order to improve my understanding."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   x   y\n",
       "0  0   1\n",
       "1  1   3\n",
       "2  2   5\n",
       "3  3   7\n",
       "4  4   9\n",
       "5  5  11"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [1, 3, 5, 7, 9, 11]})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def hypothesis(theta0, theta1, x):\n",
    "    return theta0 + theta1 * x\n",
    "\n",
    "def linear_regression(df, alpha, iterations):\n",
    "    theta0 = 0\n",
    "    theta1 = 0\n",
    "    m = len(df)\n",
    "    for _ in range(0, iterations):\n",
    "        newTheta0 = 0\n",
    "        newTheta1 = 0\n",
    "        for i in range (0, m):\n",
    "            error = hypothesis(theta0, theta1, df.x[i]) - df.y[i]\n",
    "            newTheta0 = newTheta0 + error\n",
    "            newTheta1 = newTheta1 + error * df.x[i]\n",
    "        newTheta0 = theta0 - alpha / m * newTheta0\n",
    "        newTheta1 = theta1 - alpha / m * newTheta1\n",
    "        theta0 = newTheta0\n",
    "        theta1 = newTheta1\n",
    "    return [theta0, theta1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 212 ms, sys: 3.03 ms, total: 215 ms\n",
      "Wall time: 213 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0.99999999999996192, 2.0000000000000107]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "linear_regression(df, 0.1, 1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The implementation above can be made more efficient by replacing the inner loop with matrix operations instead.\n",
    "\n",
    "Below is my attempt at doing so:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def vector_linear_regression(df, alpha, iterations):\n",
    "    m = len(df)\n",
    "    thetas = np.zeros((2,1))\n",
    "    X = pd.DataFrame(data={0: 1, 1: df.x}).as_matrix() # Is there a more elegant way to build this matrix?\n",
    "    y = np.array([df.y]).transpose()\n",
    "    op = X.transpose()\n",
    "    for _ in range(0, iterations):\n",
    "        errors = X.dot(thetas) - y\n",
    "        sums = op.dot(errors)\n",
    "        thetas = thetas - sums * alpha / m\n",
    "    return [thetas[0][0], thetas[1][0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 66.5 ms, sys: 3.47 ms, total: 69.9 ms\n",
      "Wall time: 69.2 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0.99999999999996181, 2.0000000000000107]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "vector_linear_regression(df, 0.1, 1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1],\n",
       "       [ 1,  3],\n",
       "       [ 2,  5],\n",
       "       [ 3,  7],\n",
       "       [ 4,  9],\n",
       "       [ 5, 11]])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.as_matrix()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}