{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.19.1\n",
      "0.20.3\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import sklearn\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "print sklearn.__version__\n",
    "print pd.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>symboling</th>\n",
       "      <th>normalized-losses</th>\n",
       "      <th>make</th>\n",
       "      <th>fuel-type</th>\n",
       "      <th>aspiration</th>\n",
       "      <th>num-of-doors</th>\n",
       "      <th>body-style</th>\n",
       "      <th>drive-wheels</th>\n",
       "      <th>engine-location</th>\n",
       "      <th>wheel-base</th>\n",
       "      <th>...</th>\n",
       "      <th>engine-size</th>\n",
       "      <th>fuel-system</th>\n",
       "      <th>bore</th>\n",
       "      <th>stroke</th>\n",
       "      <th>compression-ratio</th>\n",
       "      <th>horsepower</th>\n",
       "      <th>peak-rpm</th>\n",
       "      <th>city-mpg</th>\n",
       "      <th>highway-mpg</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>164</td>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>four</td>\n",
       "      <td>sedan</td>\n",
       "      <td>fwd</td>\n",
       "      <td>front</td>\n",
       "      <td>99.8</td>\n",
       "      <td>...</td>\n",
       "      <td>109</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3.19</td>\n",
       "      <td>3.4</td>\n",
       "      <td>10.0</td>\n",
       "      <td>102</td>\n",
       "      <td>5500</td>\n",
       "      <td>24</td>\n",
       "      <td>30</td>\n",
       "      <td>13950</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>164</td>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>four</td>\n",
       "      <td>sedan</td>\n",
       "      <td>4wd</td>\n",
       "      <td>front</td>\n",
       "      <td>99.4</td>\n",
       "      <td>...</td>\n",
       "      <td>136</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3.19</td>\n",
       "      <td>3.4</td>\n",
       "      <td>8.0</td>\n",
       "      <td>115</td>\n",
       "      <td>5500</td>\n",
       "      <td>18</td>\n",
       "      <td>22</td>\n",
       "      <td>17450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>158</td>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>four</td>\n",
       "      <td>sedan</td>\n",
       "      <td>fwd</td>\n",
       "      <td>front</td>\n",
       "      <td>105.8</td>\n",
       "      <td>...</td>\n",
       "      <td>136</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3.19</td>\n",
       "      <td>3.4</td>\n",
       "      <td>8.5</td>\n",
       "      <td>110</td>\n",
       "      <td>5500</td>\n",
       "      <td>19</td>\n",
       "      <td>25</td>\n",
       "      <td>17710</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>158</td>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>turbo</td>\n",
       "      <td>four</td>\n",
       "      <td>sedan</td>\n",
       "      <td>fwd</td>\n",
       "      <td>front</td>\n",
       "      <td>105.8</td>\n",
       "      <td>...</td>\n",
       "      <td>131</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3.13</td>\n",
       "      <td>3.4</td>\n",
       "      <td>8.3</td>\n",
       "      <td>140</td>\n",
       "      <td>5500</td>\n",
       "      <td>17</td>\n",
       "      <td>20</td>\n",
       "      <td>23875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>192</td>\n",
       "      <td>bmw</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>two</td>\n",
       "      <td>sedan</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>101.2</td>\n",
       "      <td>...</td>\n",
       "      <td>108</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3.50</td>\n",
       "      <td>2.8</td>\n",
       "      <td>8.8</td>\n",
       "      <td>101</td>\n",
       "      <td>5800</td>\n",
       "      <td>23</td>\n",
       "      <td>29</td>\n",
       "      <td>16430</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 26 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   symboling  normalized-losses  make fuel-type aspiration num-of-doors  \\\n",
       "0          2                164  audi       gas        std         four   \n",
       "1          2                164  audi       gas        std         four   \n",
       "2          1                158  audi       gas        std         four   \n",
       "3          1                158  audi       gas      turbo         four   \n",
       "4          2                192   bmw       gas        std          two   \n",
       "\n",
       "  body-style drive-wheels engine-location  wheel-base  ...    engine-size  \\\n",
       "0      sedan          fwd           front        99.8  ...            109   \n",
       "1      sedan          4wd           front        99.4  ...            136   \n",
       "2      sedan          fwd           front       105.8  ...            136   \n",
       "3      sedan          fwd           front       105.8  ...            131   \n",
       "4      sedan          rwd           front       101.2  ...            108   \n",
       "\n",
       "   fuel-system  bore  stroke compression-ratio horsepower  peak-rpm city-mpg  \\\n",
       "0         mpfi  3.19     3.4              10.0        102      5500       24   \n",
       "1         mpfi  3.19     3.4               8.0        115      5500       18   \n",
       "2         mpfi  3.19     3.4               8.5        110      5500       19   \n",
       "3         mpfi  3.13     3.4               8.3        140      5500       17   \n",
       "4         mpfi  3.50     2.8               8.8        101      5800       23   \n",
       "\n",
       "   highway-mpg  price  \n",
       "0           30  13950  \n",
       "1           22  17450  \n",
       "2           25  17710  \n",
       "3           20  23875  \n",
       "4           29  16430  \n",
       "\n",
       "[5 rows x 26 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "DATA_DIR = '../data'\n",
    "df = pd.read_table(\n",
    "            os.path.abspath(os.path.join(DATA_DIR, 'day2/automobile.csv')),\n",
    "            sep=','\n",
    "            \n",
    ")\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(159, 26)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 205 rows, 26 cols\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "symboling              int64\n",
       "normalized-losses      int64\n",
       "make                  object\n",
       "fuel-type             object\n",
       "aspiration            object\n",
       "num-of-doors          object\n",
       "body-style            object\n",
       "drive-wheels          object\n",
       "engine-location       object\n",
       "wheel-base           float64\n",
       "length               float64\n",
       "width                float64\n",
       "height               float64\n",
       "curb-weight            int64\n",
       "engine-type           object\n",
       "num-of-cylinders      object\n",
       "engine-size            int64\n",
       "fuel-system           object\n",
       "bore                 float64\n",
       "stroke               float64\n",
       "compression-ratio    float64\n",
       "horsepower             int64\n",
       "peak-rpm               int64\n",
       "city-mpg               int64\n",
       "highway-mpg            int64\n",
       "price                  int64\n",
       "dtype: object"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# datatypes\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Experiment 1\n",
    "\n",
    "For the first experiment we will just use numerical features as our features for prediction\n",
    "\n",
    "So, to summarize\n",
    "\n",
    "__Input__: Numerical Values   \n",
    "__Output__: Price\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>symboling</th>\n",
       "      <th>normalized-losses</th>\n",
       "      <th>wheel-base</th>\n",
       "      <th>length</th>\n",
       "      <th>width</th>\n",
       "      <th>height</th>\n",
       "      <th>curb-weight</th>\n",
       "      <th>engine-size</th>\n",
       "      <th>bore</th>\n",
       "      <th>stroke</th>\n",
       "      <th>compression-ratio</th>\n",
       "      <th>horsepower</th>\n",
       "      <th>peak-rpm</th>\n",
       "      <th>city-mpg</th>\n",
       "      <th>highway-mpg</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>164</td>\n",
       "      <td>99.8</td>\n",
       "      <td>176.6</td>\n",
       "      <td>66.2</td>\n",
       "      <td>54.3</td>\n",
       "      <td>2337</td>\n",
       "      <td>109</td>\n",
       "      <td>3.19</td>\n",
       "      <td>3.4</td>\n",
       "      <td>10.0</td>\n",
       "      <td>102</td>\n",
       "      <td>5500</td>\n",
       "      <td>24</td>\n",
       "      <td>30</td>\n",
       "      <td>13950</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>164</td>\n",
       "      <td>99.4</td>\n",
       "      <td>176.6</td>\n",
       "      <td>66.4</td>\n",
       "      <td>54.3</td>\n",
       "      <td>2824</td>\n",
       "      <td>136</td>\n",
       "      <td>3.19</td>\n",
       "      <td>3.4</td>\n",
       "      <td>8.0</td>\n",
       "      <td>115</td>\n",
       "      <td>5500</td>\n",
       "      <td>18</td>\n",
       "      <td>22</td>\n",
       "      <td>17450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>158</td>\n",
       "      <td>105.8</td>\n",
       "      <td>192.7</td>\n",
       "      <td>71.4</td>\n",
       "      <td>55.7</td>\n",
       "      <td>2844</td>\n",
       "      <td>136</td>\n",
       "      <td>3.19</td>\n",
       "      <td>3.4</td>\n",
       "      <td>8.5</td>\n",
       "      <td>110</td>\n",
       "      <td>5500</td>\n",
       "      <td>19</td>\n",
       "      <td>25</td>\n",
       "      <td>17710</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>158</td>\n",
       "      <td>105.8</td>\n",
       "      <td>192.7</td>\n",
       "      <td>71.4</td>\n",
       "      <td>55.9</td>\n",
       "      <td>3086</td>\n",
       "      <td>131</td>\n",
       "      <td>3.13</td>\n",
       "      <td>3.4</td>\n",
       "      <td>8.3</td>\n",
       "      <td>140</td>\n",
       "      <td>5500</td>\n",
       "      <td>17</td>\n",
       "      <td>20</td>\n",
       "      <td>23875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>192</td>\n",
       "      <td>101.2</td>\n",
       "      <td>176.8</td>\n",
       "      <td>64.8</td>\n",
       "      <td>54.3</td>\n",
       "      <td>2395</td>\n",
       "      <td>108</td>\n",
       "      <td>3.50</td>\n",
       "      <td>2.8</td>\n",
       "      <td>8.8</td>\n",
       "      <td>101</td>\n",
       "      <td>5800</td>\n",
       "      <td>23</td>\n",
       "      <td>29</td>\n",
       "      <td>16430</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   symboling  normalized-losses  wheel-base  length  width  height  \\\n",
       "0          2                164        99.8   176.6   66.2    54.3   \n",
       "1          2                164        99.4   176.6   66.4    54.3   \n",
       "2          1                158       105.8   192.7   71.4    55.7   \n",
       "3          1                158       105.8   192.7   71.4    55.9   \n",
       "4          2                192       101.2   176.8   64.8    54.3   \n",
       "\n",
       "   curb-weight  engine-size  bore  stroke  compression-ratio  horsepower  \\\n",
       "0         2337          109  3.19     3.4               10.0         102   \n",
       "1         2824          136  3.19     3.4                8.0         115   \n",
       "2         2844          136  3.19     3.4                8.5         110   \n",
       "3         3086          131  3.13     3.4                8.3         140   \n",
       "4         2395          108  3.50     2.8                8.8         101   \n",
       "\n",
       "   peak-rpm  city-mpg  highway-mpg  price  \n",
       "0      5500        24           30  13950  \n",
       "1      5500        18           22  17450  \n",
       "2      5500        19           25  17710  \n",
       "3      5500        17           20  23875  \n",
       "4      5800        23           29  16430  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "numerics_dtypes = ['int64', 'float64']\n",
    "df_rel = df.select_dtypes(include=numerics_dtypes)\n",
    "df_rel.loc[:,'price'] = df.price\n",
    "df_rel.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(159, 16)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we have only 16 columns of 26 that are numeric\n",
    "df_rel.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Make input and output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(159, 15)\n",
      "(159,)\n"
     ]
    }
   ],
   "source": [
    "X = df_rel.iloc[ : , :-1].values\n",
    "Y = df_rel.iloc[:,-1].values\n",
    "\n",
    "print X.shape\n",
    "print Y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train/Test Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# ideal practice is to use test as 20% - 30% of training data\n",
    "# defined by test_size in train_test_split()\n",
    "# random_state is required to avoid sequential biasness in the data distribution\n",
    "def data_split(X, Y):\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state = 10)\n",
    "    return X_train, X_test, Y_train, Y_test\n",
    "\n",
    "X_train, X_test, Y_train, Y_test = data_split(X, Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((127, 15), (32, 15))"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class Regression:\n",
    "    \n",
    "    def __init__(self):\n",
    "        self.regressor = LinearRegression()\n",
    "    \n",
    "    def train(self, X_train, Y_train):\n",
    "        model = self.regressor.fit(X_train, Y_train)\n",
    "        return model\n",
    "\n",
    "    def predict(self, model, X_test):\n",
    "        return model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "regress = Regression()\n",
    "model = regress.train(X_train, Y_train)\n",
    "predictions_train = regress.predict(model, X_train)\n",
    "predictions_test = regress.predict(model, X_test)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}