{ "metadata": { "name": "", "signature": "sha256:812eb27ae83c7e639470991deb6b3d9c5632ca1eaf429b7758c302ea28dd400b" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "%pylab inline" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: pylab import has clobbered these variables: ['clf']\n", "`%matplotlib` prevents importing * from pylab and numpy\n" ] } ], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "from __future__ import division\n", "class Transformations(object):\n", " \"\"\"since these transformations are all related, we'll nest them all under a feature norm class\"\"\"\n", " def mean_at_zero(self, arr):\n", " return np.array([i - np.mean(a) for i in arr])\n", "\n", " def norm_to_min_zero(self, arr):\n", " return np.array([i / max(a) for i in arr])\n", " \n", " def norm_to_absolute_min_zero(self, arr):\n", " \"\"\"should be a range of 0 to 1, where 0 maintains its 0 value\"\"\"\n", " return np.array([(i-min(arr))/(max(arr)-min(arr)) for i in arr])\n", " \n", " def norm_to_neg_pos(self, arr):\n", " \"\"\"should be a range of -1 to 1, where 0 represents the mean\"\"\"\n", " return np.array([(i-mean(arr))/(max(arr)-mean(arr)) for i in arr])\n", " \n", " def norm_by_std(self, arr):\n", " \"\"\"should be a range where 0 represents the mean\"\"\"\n", " return np.array([(i-mean(arr))/std(arr) for i in arr])\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "transformer = Transformations()\n", "a = np.array([1.0, 2.0, 3.0, 4.0, 5.0])\n", "print transformer.norm_to_absolute_min_zero(a) == np.array([0.0, 0.25, 0.5, 0.75, 1.0])\n", "print transformer.norm_to_neg_pos(a) == np.array([-1.0, -0.5, 0.0, 0.5, 1.0])\n", "print transformer.norm_by_std(a) == np.array([-1.414213562373095, -0.7071067811865475, 0.0, 0.7071067811865475, 1.414213562373095])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[ True True True True True]\n", "[ True True True True True]\n", "[ True True True True True]\n" ] } ], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "from sklearn import tree\n", "from sklearn.cross_validation import cross_val_score\n", "\n", "# Load in data and create sets. dropping all na columns on the live data set.\n", "lemons = pd.read_csv('./data/lemons.csv')\n", "lemons_oos = pd.read_csv('./data/lemons_oos.csv')\n", "\n", "print lemons.dtypes" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "RefId int64\n", "IsBadBuy int64\n", "PurchDate object\n", "Auction object\n", "VehYear int64\n", "VehicleAge int64\n", "Make object\n", "Model object\n", "Trim object\n", "SubModel object\n", "Color object\n", "Transmission object\n", "WheelTypeID float64\n", "WheelType object\n", "VehOdo int64\n", "Nationality object\n", "Size object\n", "TopThreeAmericanName object\n", "MMRAcquisitionAuctionAveragePrice float64\n", "MMRAcquisitionAuctionCleanPrice float64\n", "MMRAcquisitionRetailAveragePrice float64\n", "MMRAcquisitonRetailCleanPrice float64\n", "MMRCurrentAuctionAveragePrice float64\n", "MMRCurrentAuctionCleanPrice float64\n", "MMRCurrentRetailAveragePrice float64\n", "MMRCurrentRetailCleanPrice float64\n", "PRIMEUNIT object\n", "AUCGUART object\n", "BYRNO int64\n", "VNZIP1 int64\n", "VNST object\n", "VehBCost float64\n", "IsOnlineSale int64\n", "WarrantyCost int64\n", "dtype: object\n" ] } ], "prompt_number": 44 }, { "cell_type": "code", "collapsed": false, "input": [ "lemons = lemons.dropna(axis=1)\n", "# Generating a list of continuous data features from the describe dataframe. \n", "# Then, removing the two non-features (RefId is an index, IsBadBuy is the prediction value)\n", "features = list(lemons.describe().columns)\n", "features.remove('RefId')\n", "features.remove('IsBadBuy')\n", "\n", "best_score = -1\n", "for depth in range(1, 10):\n", " scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=depth, random_state=1234),\n", " lemons[features],\n", " lemons.IsBadBuy,\n", " scoring='roc_auc',\n", " cv=5)\n", " if scores.mean() > best_score:\n", " best_depth = depth\n", " best_score = scores.mean()\n", "\n", "# Is the best score we have better than each DummyClassifier type?\n", "from sklearn import dummy, metrics\n", "for strat in ['stratified', 'most_frequent', 'uniform']:\n", " dummyclf = dummy.DummyClassifier(strategy=strat).fit(lemons[features], lemons.IsBadBuy)\n", " print 'did better than %s?' % strat, metrics.roc_auc_score(lemons.IsBadBuy, dummyclf.predict(lemons[features])) < best_score\n", "\n", "# seems so!\n", "\n", "# Create a classifier and prediction.\n", "clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=1234).fit(lemons[features], lemons.IsBadBuy)\n", "\n", "y_pred = clf.predict(lemons_oos[features])\n", "\n", "# Create a submission\n", "submission = pd.DataFrame({ 'RefId' : lemons_oos.RefId, 'prediction' : y_pred })\n", "submission.to_csv('submission.csv')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "did better than stratified? True\n", "did better than most_frequent? True\n", "did better than uniform? True\n" ] } ], "prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [ "lemons[features]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | VehYear | \n", "VehicleAge | \n", "VehOdo | \n", "BYRNO | \n", "VNZIP1 | \n", "VehBCost | \n", "IsOnlineSale | \n", "WarrantyCost | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "2006 | \n", "3 | \n", "89046 | \n", "21973 | \n", "33619 | \n", "7100 | \n", "0 | \n", "1113 | \n", "
1 | \n", "2004 | \n", "5 | \n", "93593 | \n", "19638 | \n", "33619 | \n", "7600 | \n", "0 | \n", "1053 | \n", "
2 | \n", "2005 | \n", "4 | \n", "69367 | \n", "19638 | \n", "33619 | \n", "4000 | \n", "0 | \n", "1020 | \n", "
3 | \n", "2004 | \n", "5 | \n", "81054 | \n", "19638 | \n", "33619 | \n", "5600 | \n", "0 | \n", "594 | \n", "
4 | \n", "2004 | \n", "5 | \n", "65328 | \n", "19638 | \n", "33619 | \n", "4200 | \n", "0 | \n", "533 | \n", "
5 | \n", "2005 | \n", "4 | \n", "79315 | \n", "19638 | \n", "33619 | \n", "5400 | \n", "0 | \n", "1623 | \n", "
6 | \n", "2006 | \n", "3 | \n", "74722 | \n", "19638 | \n", "33619 | \n", "6900 | \n", "0 | \n", "1623 | \n", "
7 | \n", "2003 | \n", "6 | \n", "72132 | \n", "5546 | \n", "33619 | \n", "3300 | \n", "0 | \n", "1455 | \n", "
8 | \n", "2005 | \n", "4 | \n", "80736 | \n", "19638 | \n", "33619 | \n", "6800 | \n", "0 | \n", "1243 | \n", "
9 | \n", "2003 | \n", "6 | \n", "75156 | \n", "5546 | \n", "33619 | \n", "4900 | \n", "0 | \n", "1923 | \n", "
10 | \n", "2004 | \n", "5 | \n", "84498 | \n", "19638 | \n", "33619 | \n", "7100 | \n", "0 | \n", "1243 | \n", "
11 | \n", "2002 | \n", "7 | \n", "66536 | \n", "5546 | \n", "33619 | \n", "5800 | \n", "0 | \n", "2003 | \n", "
12 | \n", "2006 | \n", "3 | \n", "59789 | \n", "5546 | \n", "33619 | \n", "7700 | \n", "0 | \n", "671 | \n", "
13 | \n", "2004 | \n", "5 | \n", "52106 | \n", "5546 | \n", "33619 | \n", "4500 | \n", "0 | \n", "754 | \n", "
14 | \n", "2002 | \n", "7 | \n", "88958 | \n", "5546 | \n", "33619 | \n", "8000 | \n", "0 | \n", "2452 | \n", "
15 | \n", "2004 | \n", "5 | \n", "76173 | \n", "19638 | \n", "33619 | \n", "8800 | \n", "0 | \n", "920 | \n", "
16 | \n", "2005 | \n", "4 | \n", "80064 | \n", "19638 | \n", "33619 | \n", "5600 | \n", "0 | \n", "1763 | \n", "
17 | \n", "2003 | \n", "6 | \n", "77694 | \n", "19638 | \n", "33619 | \n", "8300 | \n", "0 | \n", "1923 | \n", "
18 | \n", "2004 | \n", "5 | \n", "57723 | \n", "19638 | \n", "33619 | \n", "7000 | \n", "0 | \n", "671 | \n", "
19 | \n", "2005 | \n", "4 | \n", "78434 | \n", "19638 | \n", "33619 | \n", "10700 | \n", "0 | \n", "1272 | \n", "
20 | \n", "2001 | \n", "8 | \n", "82944 | \n", "19638 | \n", "33619 | \n", "3600 | \n", "0 | \n", "2322 | \n", "
21 | \n", "2003 | \n", "6 | \n", "55711 | \n", "19638 | \n", "33619 | \n", "5100 | \n", "0 | \n", "971 | \n", "
22 | \n", "2005 | \n", "5 | \n", "76586 | \n", "19638 | \n", "33619 | \n", "8200 | \n", "0 | \n", "1389 | \n", "
23 | \n", "2005 | \n", "5 | \n", "86889 | \n", "19638 | \n", "33619 | \n", "5200 | \n", "0 | \n", "594 | \n", "
24 | \n", "2004 | \n", "6 | \n", "68990 | \n", "19619 | \n", "33619 | \n", "8500 | \n", "0 | \n", "1215 | \n", "
25 | \n", "2008 | \n", "2 | \n", "80949 | \n", "19619 | \n", "33619 | \n", "7900 | \n", "0 | \n", "2152 | \n", "
26 | \n", "2003 | \n", "7 | \n", "59858 | \n", "5546 | \n", "33619 | \n", "4600 | \n", "0 | \n", "1220 | \n", "
27 | \n", "2006 | \n", "4 | \n", "50227 | \n", "19619 | \n", "33619 | \n", "7500 | \n", "0 | \n", "1003 | \n", "
28 | \n", "2006 | \n", "4 | \n", "58024 | \n", "20928 | \n", "33619 | \n", "5600 | \n", "0 | \n", "671 | \n", "
29 | \n", "2006 | \n", "4 | \n", "40919 | \n", "20928 | \n", "33619 | \n", "7700 | \n", "0 | \n", "623 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
51058 | \n", "2002 | \n", "7 | \n", "81794 | \n", "18881 | \n", "30212 | \n", "3400 | \n", "0 | \n", "2063 | \n", "
51059 | \n", "2008 | \n", "1 | \n", "49069 | \n", "18881 | \n", "30212 | \n", "8000 | \n", "0 | \n", "482 | \n", "
51060 | \n", "2006 | \n", "3 | \n", "69007 | \n", "18111 | \n", "30212 | \n", "5600 | \n", "0 | \n", "728 | \n", "
51061 | \n", "2006 | \n", "3 | \n", "62228 | \n", "18111 | \n", "30212 | \n", "4525 | \n", "0 | \n", "533 | \n", "
51062 | \n", "2001 | \n", "8 | \n", "72553 | \n", "18111 | \n", "30212 | \n", "4325 | \n", "0 | \n", "1220 | \n", "
51063 | \n", "2005 | \n", "4 | \n", "72224 | \n", "18111 | \n", "30212 | \n", "6100 | \n", "0 | \n", "1038 | \n", "
51064 | \n", "2006 | \n", "3 | \n", "64020 | \n", "18111 | \n", "30212 | \n", "7440 | \n", "0 | \n", "1703 | \n", "
51065 | \n", "2005 | \n", "4 | \n", "48447 | \n", "18111 | \n", "30212 | \n", "7340 | \n", "0 | \n", "1328 | \n", "
51066 | \n", "2004 | \n", "5 | \n", "81403 | \n", "18111 | \n", "30212 | \n", "8900 | \n", "0 | \n", "983 | \n", "
51067 | \n", "2005 | \n", "4 | \n", "77249 | \n", "18111 | \n", "30212 | \n", "5330 | \n", "0 | \n", "1389 | \n", "
51068 | \n", "2006 | \n", "3 | \n", "60549 | \n", "18111 | \n", "30212 | \n", "5600 | \n", "0 | \n", "533 | \n", "
51069 | \n", "2002 | \n", "7 | \n", "82568 | \n", "18111 | \n", "30212 | \n", "4015 | \n", "0 | \n", "1543 | \n", "
51070 | \n", "2006 | \n", "3 | \n", "64990 | \n", "18111 | \n", "30212 | \n", "6835 | \n", "0 | \n", "1703 | \n", "
51071 | \n", "2007 | \n", "2 | \n", "60074 | \n", "18111 | \n", "30212 | \n", "5700 | \n", "0 | \n", "533 | \n", "
51072 | \n", "2006 | \n", "3 | \n", "84168 | \n", "18881 | \n", "30212 | \n", "4800 | \n", "0 | \n", "1243 | \n", "
51073 | \n", "2007 | \n", "2 | \n", "72802 | \n", "18881 | \n", "30212 | \n", "9500 | \n", "0 | \n", "1389 | \n", "
51074 | \n", "2005 | \n", "4 | \n", "59383 | \n", "18111 | \n", "30212 | \n", "9000 | \n", "0 | \n", "1417 | \n", "
51075 | \n", "2002 | \n", "7 | \n", "75700 | \n", "18881 | \n", "30212 | \n", "7000 | \n", "0 | \n", "1455 | \n", "
51076 | \n", "2006 | \n", "3 | \n", "70004 | \n", "18881 | \n", "30212 | \n", "5000 | \n", "0 | \n", "1155 | \n", "
51077 | \n", "2005 | \n", "4 | \n", "48642 | \n", "18881 | \n", "30212 | \n", "5500 | \n", "0 | \n", "482 | \n", "
51078 | \n", "2006 | \n", "3 | \n", "57444 | \n", "18881 | \n", "30212 | \n", "9800 | \n", "0 | \n", "1251 | \n", "
51079 | \n", "2004 | \n", "5 | \n", "69098 | \n", "18881 | \n", "30212 | \n", "4500 | \n", "0 | \n", "533 | \n", "
51080 | \n", "2004 | \n", "5 | \n", "76391 | \n", "18111 | \n", "30212 | \n", "4200 | \n", "0 | \n", "803 | \n", "
51081 | \n", "2007 | \n", "2 | \n", "44622 | \n", "18881 | \n", "30212 | \n", "6000 | \n", "0 | \n", "482 | \n", "
51082 | \n", "2006 | \n", "3 | \n", "69941 | \n", "18111 | \n", "30212 | \n", "10400 | \n", "0 | \n", "1606 | \n", "
51083 | \n", "2002 | \n", "7 | \n", "93744 | \n", "18111 | \n", "30212 | \n", "7500 | \n", "0 | \n", "1353 | \n", "
51084 | \n", "2007 | \n", "2 | \n", "74407 | \n", "18111 | \n", "30212 | \n", "8000 | \n", "0 | \n", "803 | \n", "
51085 | \n", "2004 | \n", "5 | \n", "82563 | \n", "18881 | \n", "30212 | \n", "7000 | \n", "0 | \n", "1243 | \n", "
51086 | \n", "2006 | \n", "3 | \n", "65399 | \n", "18111 | \n", "30212 | \n", "7900 | \n", "0 | \n", "1508 | \n", "
51087 | \n", "2006 | \n", "3 | \n", "79554 | \n", "18881 | \n", "30212 | \n", "7000 | \n", "0 | \n", "1974 | \n", "
51088 rows \u00d7 8 columns
\n", "\n", " | VehYear | \n", "VehicleAge | \n", "WheelTypeID | \n", "VehOdo | \n", "MMRAcquisitionAuctionAveragePrice | \n", "MMRAcquisitionAuctionCleanPrice | \n", "MMRAcquisitionRetailAveragePrice | \n", "MMRAcquisitonRetailCleanPrice | \n", "MMRCurrentAuctionAveragePrice | \n", "MMRCurrentAuctionCleanPrice | \n", "MMRCurrentRetailAveragePrice | \n", "MMRCurrentRetailCleanPrice | \n", "BYRNO | \n", "VNZIP1 | \n", "VehBCost | \n", "IsOnlineSale | \n", "WarrantyCost | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2006 | \n", "3 | \n", "1 | \n", "89046 | \n", "8155 | \n", "9829 | \n", "11636 | \n", "13600 | \n", "7451 | \n", "8552 | \n", "11597 | \n", "12409 | \n", "21973 | \n", "33619 | \n", "7100 | \n", "0 | \n", "1113 | \n", "
1 | \n", "2004 | \n", "5 | \n", "1 | \n", "93593 | \n", "6854 | \n", "8383 | \n", "10897 | \n", "12572 | \n", "7456 | \n", "9222 | \n", "11374 | \n", "12791 | \n", "19638 | \n", "33619 | \n", "7600 | \n", "0 | \n", "1053 | \n", "
2 | \n", "2005 | \n", "4 | \n", "2 | \n", "69367 | \n", "3913 | \n", "5054 | \n", "7723 | \n", "8707 | \n", "3247 | \n", "4384 | \n", "6739 | \n", "7911 | \n", "19638 | \n", "33619 | \n", "4000 | \n", "0 | \n", "1020 | \n", "
3 | \n", "2004 | \n", "5 | \n", "2 | \n", "81054 | \n", "3901 | \n", "4908 | \n", "6706 | \n", "8577 | \n", "4709 | \n", "5827 | \n", "8149 | \n", "9451 | \n", "19638 | \n", "33619 | \n", "5600 | \n", "0 | \n", "594 | \n", "
4 | \n", "2004 | \n", "5 | \n", "2 | \n", "65328 | \n", "2966 | \n", "4038 | \n", "6240 | \n", "8496 | \n", "2980 | \n", "4115 | \n", "6230 | \n", "8603 | \n", "19638 | \n", "33619 | \n", "4200 | \n", "0 | \n", "533 | \n", "