{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.19.1\n", "0.20.3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", " \"This module will be removed in 0.20.\", DeprecationWarning)\n" ] } ], "source": [ "import pandas as pd\n", "import sklearn\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.cross_validation import train_test_split\n", "import os\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "print sklearn.__version__\n", "print pd.__version__" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
symbolingnormalized-lossesmakefuel-typeaspirationnum-of-doorsbody-styledrive-wheelsengine-locationwheel-base...engine-sizefuel-systemborestrokecompression-ratiohorsepowerpeak-rpmcity-mpghighway-mpgprice
02164audigasstdfoursedanfwdfront99.8...109mpfi3.193.410.01025500243013950
12164audigasstdfoursedan4wdfront99.4...136mpfi3.193.48.01155500182217450
21158audigasstdfoursedanfwdfront105.8...136mpfi3.193.48.51105500192517710
31158audigasturbofoursedanfwdfront105.8...131mpfi3.133.48.31405500172023875
42192bmwgasstdtwosedanrwdfront101.2...108mpfi3.502.88.81015800232916430
\n", "

5 rows × 26 columns

\n", "
" ], "text/plain": [ " symboling normalized-losses make fuel-type aspiration num-of-doors \\\n", "0 2 164 audi gas std four \n", "1 2 164 audi gas std four \n", "2 1 158 audi gas std four \n", "3 1 158 audi gas turbo four \n", "4 2 192 bmw gas std two \n", "\n", " body-style drive-wheels engine-location wheel-base ... engine-size \\\n", "0 sedan fwd front 99.8 ... 109 \n", "1 sedan 4wd front 99.4 ... 136 \n", "2 sedan fwd front 105.8 ... 136 \n", "3 sedan fwd front 105.8 ... 131 \n", "4 sedan rwd front 101.2 ... 108 \n", "\n", " fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg \\\n", "0 mpfi 3.19 3.4 10.0 102 5500 24 \n", "1 mpfi 3.19 3.4 8.0 115 5500 18 \n", "2 mpfi 3.19 3.4 8.5 110 5500 19 \n", "3 mpfi 3.13 3.4 8.3 140 5500 17 \n", "4 mpfi 3.50 2.8 8.8 101 5800 23 \n", "\n", " highway-mpg price \n", "0 30 13950 \n", "1 22 17450 \n", "2 25 17710 \n", "3 20 23875 \n", "4 29 16430 \n", "\n", "[5 rows x 26 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DATA_DIR = '../data'\n", "df = pd.read_table(\n", " os.path.abspath(os.path.join(DATA_DIR, 'day2/automobile.csv')),\n", " sep=','\n", " \n", ")\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(159, 26)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 205 rows, 26 cols\n", "df.shape" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "symboling int64\n", "normalized-losses int64\n", "make object\n", "fuel-type object\n", "aspiration object\n", "num-of-doors object\n", "body-style object\n", "drive-wheels object\n", "engine-location object\n", "wheel-base float64\n", "length float64\n", "width float64\n", "height float64\n", "curb-weight int64\n", "engine-type object\n", "num-of-cylinders object\n", "engine-size int64\n", "fuel-system object\n", "bore float64\n", "stroke float64\n", "compression-ratio float64\n", "horsepower int64\n", "peak-rpm int64\n", "city-mpg int64\n", "highway-mpg int64\n", "price int64\n", "dtype: object" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# datatypes\n", "df.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Experiment 1\n", "\n", "For the first experiment we will just use numerical features as our features for prediction\n", "\n", "So, to summarize\n", "\n", "__Input__: Numerical Values \n", "__Output__: Price\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
symbolingnormalized-losseswheel-baselengthwidthheightcurb-weightengine-sizeborestrokecompression-ratiohorsepowerpeak-rpmcity-mpghighway-mpgprice
0216499.8176.666.254.323371093.193.410.01025500243013950
1216499.4176.666.454.328241363.193.48.01155500182217450
21158105.8192.771.455.728441363.193.48.51105500192517710
31158105.8192.771.455.930861313.133.48.31405500172023875
42192101.2176.864.854.323951083.502.88.81015800232916430
\n", "
" ], "text/plain": [ " symboling normalized-losses wheel-base length width height \\\n", "0 2 164 99.8 176.6 66.2 54.3 \n", "1 2 164 99.4 176.6 66.4 54.3 \n", "2 1 158 105.8 192.7 71.4 55.7 \n", "3 1 158 105.8 192.7 71.4 55.9 \n", "4 2 192 101.2 176.8 64.8 54.3 \n", "\n", " curb-weight engine-size bore stroke compression-ratio horsepower \\\n", "0 2337 109 3.19 3.4 10.0 102 \n", "1 2824 136 3.19 3.4 8.0 115 \n", "2 2844 136 3.19 3.4 8.5 110 \n", "3 3086 131 3.13 3.4 8.3 140 \n", "4 2395 108 3.50 2.8 8.8 101 \n", "\n", " peak-rpm city-mpg highway-mpg price \n", "0 5500 24 30 13950 \n", "1 5500 18 22 17450 \n", "2 5500 19 25 17710 \n", "3 5500 17 20 23875 \n", "4 5800 23 29 16430 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "numerics_dtypes = ['int64', 'float64']\n", "df_rel = df.select_dtypes(include=numerics_dtypes)\n", "df_rel.loc[:,'price'] = df.price\n", "df_rel.head(5)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(159, 16)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we have only 16 columns of 26 that are numeric\n", "df_rel.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Make input and output" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(159, 15)\n", "(159,)\n" ] } ], "source": [ "X = df_rel.iloc[ : , :-1].values\n", "Y = df_rel.iloc[:,-1].values\n", "\n", "print X.shape\n", "print Y.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Train/Test Split" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# ideal practice is to use test as 20% - 30% of training data\n", "# defined by test_size in train_test_split()\n", "# random_state is required to avoid sequential biasness in the data distribution\n", "def data_split(X, Y):\n", " X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state = 10)\n", " return X_train, X_test, Y_train, Y_test\n", "\n", "X_train, X_test, Y_train, Y_test = data_split(X, Y)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "((127, 15), (32, 15))" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.shape, X_test.shape" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "class Regression:\n", " \n", " def __init__(self):\n", " self.regressor = LinearRegression()\n", " \n", " def train(self, X_train, Y_train):\n", " model = self.regressor.fit(X_train, Y_train)\n", " return model\n", "\n", " def predict(self, model, X_test):\n", " return model.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [], "source": [ "regress = Regression()\n", "model = regress.train(X_train, Y_train)\n", "predictions_train = regress.predict(model, X_train)\n", "predictions_test = regress.predict(model, X_test)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }