{ "metadata": { "name": "", "signature": "sha256:812eb27ae83c7e639470991deb6b3d9c5632ca1eaf429b7758c302ea28dd400b" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "%pylab inline" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: pylab import has clobbered these variables: ['clf']\n", "`%matplotlib` prevents importing * from pylab and numpy\n" ] } ], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [
 "from __future__ import division\n",
 "import numpy as np\n",
 "\n",
 "\n",
 "class Transformations(object):\n",
 "    \"\"\"Feature-scaling helpers, grouped in one class because they are closely related.\n",
 "\n",
 "    Every method accepts a 1-D array-like and returns a new float numpy array of the\n",
 "    same length; the input is never modified. Empty input gives an empty array.\n",
 "    When the divisor is zero (e.g. all values are equal) numpy's division rules\n",
 "    apply, so the result contains nan/inf instead of raising.\n",
 "    \"\"\"\n",
 "\n",
 "    @staticmethod\n",
 "    def _rescale(arr, offset, scale):\n",
 "        \"\"\"Return (arr - offset(arr)) / scale(arr), computed once over the whole array.\"\"\"\n",
 "        values = np.asarray(arr, dtype=float)\n",
 "        if values.size == 0:\n",
 "            # np.max / np.min raise on empty input; there is nothing to scale anyway.\n",
 "            return values\n",
 "        return (values - offset(values)) / scale(values)\n",
 "\n",
 "    def mean_at_zero(self, arr):\n",
 "        \"\"\"Shift the values so that their mean becomes 0.\"\"\"\n",
 "        # Statistics come from `arr` itself, never from a notebook-level global.\n",
 "        return self._rescale(arr, np.mean, lambda v: 1.0)\n",
 "\n",
 "    def norm_to_min_zero(self, arr):\n",
 "        \"\"\"Divide by the maximum: the largest value maps to 1 and 0 stays 0.\"\"\"\n",
 "        return self._rescale(arr, lambda v: 0.0, np.max)\n",
 "\n",
 "    def norm_to_absolute_min_zero(self, arr):\n",
 "        \"\"\"Min-max scaling to the range 0..1: the minimum maps to 0, the maximum to 1.\"\"\"\n",
 "        return self._rescale(arr, np.min, lambda v: np.max(v) - np.min(v))\n",
 "\n",
 "    def norm_to_neg_pos(self, arr):\n",
 "        \"\"\"Centre on the mean and divide by (max - mean).\n",
 "\n",
 "        The mean maps to 0 and the maximum to 1; the minimum maps to -1 only when\n",
 "        the values are symmetric about their mean.\n",
 "        \"\"\"\n",
 "        return self._rescale(arr, np.mean, lambda v: np.max(v) - np.mean(v))\n",
 "\n",
 "    def norm_by_std(self, arr):\n",
 "        \"\"\"Standard score: (value - mean) / std, using np.std's default (population) std.\"\"\"\n",
 "        return self._rescale(arr, np.mean, np.std)\n"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "transformer = Transformations()\n", "a = np.array([1.0, 
2.0, 3.0, 4.0, 5.0])\n", "print transformer.norm_to_absolute_min_zero(a) == np.array([0.0, 0.25, 0.5, 0.75, 1.0])\n", "print transformer.norm_to_neg_pos(a) == np.array([-1.0, -0.5, 0.0, 0.5, 1.0])\n", "print transformer.norm_by_std(a) == np.array([-1.414213562373095, -0.7071067811865475, 0.0, 0.7071067811865475, 1.414213562373095])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[ True True True True True]\n", "[ True True True True True]\n", "[ True True True True True]\n" ] } ], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "from sklearn import tree\n", "from sklearn.cross_validation import cross_val_score\n", "\n", "# Load the training set (lemons) and the out-of-sample / live set (lemons_oos).\n", "# Nothing is dropped here; columns containing NaN are removed from the training frame in the next cell.\n", "lemons = pd.read_csv('./data/lemons.csv')\n", "lemons_oos = pd.read_csv('./data/lemons_oos.csv')\n", "\n", "print lemons.dtypes" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "RefId int64\n", "IsBadBuy int64\n", "PurchDate object\n", "Auction object\n", "VehYear int64\n", "VehicleAge int64\n", "Make object\n", "Model object\n", "Trim object\n", "SubModel object\n", "Color object\n", "Transmission object\n", "WheelTypeID float64\n", "WheelType object\n", "VehOdo int64\n", "Nationality object\n", "Size object\n", "TopThreeAmericanName object\n", "MMRAcquisitionAuctionAveragePrice float64\n", "MMRAcquisitionAuctionCleanPrice float64\n", "MMRAcquisitionRetailAveragePrice float64\n", "MMRAcquisitonRetailCleanPrice float64\n", "MMRCurrentAuctionAveragePrice float64\n", "MMRCurrentAuctionCleanPrice float64\n", "MMRCurrentRetailAveragePrice float64\n", "MMRCurrentRetailCleanPrice float64\n", "PRIMEUNIT object\n", "AUCGUART object\n", "BYRNO int64\n", "VNZIP1 int64\n", "VNST object\n", "VehBCost float64\n", "IsOnlineSale int64\n", "WarrantyCost int64\n", "dtype: object\n" ] } ], "prompt_number": 
44 }, { "cell_type": "code", "collapsed": false, "input": [
 "# Drop every column that contains at least one NaN (dropna's default how='any').\n",
 "lemons = lemons.dropna(axis=1)\n",
 "# Generating a list of continuous data features from the describe dataframe.\n",
 "# Then, removing the two non-features (RefId is an index, IsBadBuy is the prediction value)\n",
 "features = list(lemons.describe().columns)\n",
 "features.remove('RefId')\n",
 "features.remove('IsBadBuy')\n",
 "\n",
 "# Pick the tree depth (1..9) with the highest mean 5-fold cross-validated ROC AUC.\n",
 "best_score = -1\n",
 "for depth in range(1, 10):\n",
 "    scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=depth, random_state=1234),\n",
 "                             lemons[features],\n",
 "                             lemons.IsBadBuy,\n",
 "                             scoring='roc_auc',\n",
 "                             cv=5)\n",
 "    if scores.mean() > best_score:\n",
 "        best_depth = depth\n",
 "        best_score = scores.mean()\n",
 "\n",
 "# Is the best score we have better than each DummyClassifier type?\n",
 "# NOTE: the dummy AUC is computed in-sample from hard predictions, while best_score is cross-validated.\n",
 "from sklearn import dummy, metrics\n",
 "for strat in ['stratified', 'most_frequent', 'uniform']:\n",
 "    dummyclf = dummy.DummyClassifier(strategy=strat).fit(lemons[features], lemons.IsBadBuy)\n",
 "    print 'did better than %s?' % strat, metrics.roc_auc_score(lemons.IsBadBuy, dummyclf.predict(lemons[features])) < best_score\n",
 "\n",
 "# seems so!\n",
 "\n",
 "# Create a classifier and prediction.\n",
 "# Use best_depth here: after the loop `depth` is only the last value tried (9), not the selected one.\n",
 "clf = tree.DecisionTreeClassifier(max_depth=best_depth, random_state=1234).fit(lemons[features], lemons.IsBadBuy)\n",
 "\n",
 "y_pred = clf.predict(lemons_oos[features])\n",
 "\n",
 "# Create a submission\n",
 "submission = pd.DataFrame({ 'RefId' : lemons_oos.RefId, 'prediction' : y_pred })\n",
 "submission.to_csv('submission.csv')"
 ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "did better than stratified? True\n", "did better than most_frequent? True\n", "did better than uniform? True\n" ] } ], "prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [ "lemons[features]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
VehYearVehicleAgeVehOdoBYRNOVNZIP1VehBCostIsOnlineSaleWarrantyCost
0 2006 3 89046 21973 33619 7100 0 1113
1 2004 5 93593 19638 33619 7600 0 1053
2 2005 4 69367 19638 33619 4000 0 1020
3 2004 5 81054 19638 33619 5600 0 594
4 2004 5 65328 19638 33619 4200 0 533
5 2005 4 79315 19638 33619 5400 0 1623
6 2006 3 74722 19638 33619 6900 0 1623
7 2003 6 72132 5546 33619 3300 0 1455
8 2005 4 80736 19638 33619 6800 0 1243
9 2003 6 75156 5546 33619 4900 0 1923
10 2004 5 84498 19638 33619 7100 0 1243
11 2002 7 66536 5546 33619 5800 0 2003
12 2006 3 59789 5546 33619 7700 0 671
13 2004 5 52106 5546 33619 4500 0 754
14 2002 7 88958 5546 33619 8000 0 2452
15 2004 5 76173 19638 33619 8800 0 920
16 2005 4 80064 19638 33619 5600 0 1763
17 2003 6 77694 19638 33619 8300 0 1923
18 2004 5 57723 19638 33619 7000 0 671
19 2005 4 78434 19638 33619 10700 0 1272
20 2001 8 82944 19638 33619 3600 0 2322
21 2003 6 55711 19638 33619 5100 0 971
22 2005 5 76586 19638 33619 8200 0 1389
23 2005 5 86889 19638 33619 5200 0 594
24 2004 6 68990 19619 33619 8500 0 1215
25 2008 2 80949 19619 33619 7900 0 2152
26 2003 7 59858 5546 33619 4600 0 1220
27 2006 4 50227 19619 33619 7500 0 1003
28 2006 4 58024 20928 33619 5600 0 671
29 2006 4 40919 20928 33619 7700 0 623
...........................
51058 2002 7 81794 18881 30212 3400 0 2063
51059 2008 1 49069 18881 30212 8000 0 482
51060 2006 3 69007 18111 30212 5600 0 728
51061 2006 3 62228 18111 30212 4525 0 533
51062 2001 8 72553 18111 30212 4325 0 1220
51063 2005 4 72224 18111 30212 6100 0 1038
51064 2006 3 64020 18111 30212 7440 0 1703
51065 2005 4 48447 18111 30212 7340 0 1328
51066 2004 5 81403 18111 30212 8900 0 983
51067 2005 4 77249 18111 30212 5330 0 1389
51068 2006 3 60549 18111 30212 5600 0 533
51069 2002 7 82568 18111 30212 4015 0 1543
51070 2006 3 64990 18111 30212 6835 0 1703
51071 2007 2 60074 18111 30212 5700 0 533
51072 2006 3 84168 18881 30212 4800 0 1243
51073 2007 2 72802 18881 30212 9500 0 1389
51074 2005 4 59383 18111 30212 9000 0 1417
51075 2002 7 75700 18881 30212 7000 0 1455
51076 2006 3 70004 18881 30212 5000 0 1155
51077 2005 4 48642 18881 30212 5500 0 482
51078 2006 3 57444 18881 30212 9800 0 1251
51079 2004 5 69098 18881 30212 4500 0 533
51080 2004 5 76391 18111 30212 4200 0 803
51081 2007 2 44622 18881 30212 6000 0 482
51082 2006 3 69941 18111 30212 10400 0 1606
51083 2002 7 93744 18111 30212 7500 0 1353
51084 2007 2 74407 18111 30212 8000 0 803
51085 2004 5 82563 18881 30212 7000 0 1243
51086 2006 3 65399 18111 30212 7900 0 1508
51087 2006 3 79554 18881 30212 7000 0 1974
\n", "

51088 rows \u00d7 8 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 46, "text": [ " VehYear VehicleAge VehOdo BYRNO VNZIP1 VehBCost IsOnlineSale \\\n", "0 2006 3 89046 21973 33619 7100 0 \n", "1 2004 5 93593 19638 33619 7600 0 \n", "2 2005 4 69367 19638 33619 4000 0 \n", "3 2004 5 81054 19638 33619 5600 0 \n", "4 2004 5 65328 19638 33619 4200 0 \n", "5 2005 4 79315 19638 33619 5400 0 \n", "6 2006 3 74722 19638 33619 6900 0 \n", "7 2003 6 72132 5546 33619 3300 0 \n", "8 2005 4 80736 19638 33619 6800 0 \n", "9 2003 6 75156 5546 33619 4900 0 \n", "10 2004 5 84498 19638 33619 7100 0 \n", "11 2002 7 66536 5546 33619 5800 0 \n", "12 2006 3 59789 5546 33619 7700 0 \n", "13 2004 5 52106 5546 33619 4500 0 \n", "14 2002 7 88958 5546 33619 8000 0 \n", "15 2004 5 76173 19638 33619 8800 0 \n", "16 2005 4 80064 19638 33619 5600 0 \n", "17 2003 6 77694 19638 33619 8300 0 \n", "18 2004 5 57723 19638 33619 7000 0 \n", "19 2005 4 78434 19638 33619 10700 0 \n", "20 2001 8 82944 19638 33619 3600 0 \n", "21 2003 6 55711 19638 33619 5100 0 \n", "22 2005 5 76586 19638 33619 8200 0 \n", "23 2005 5 86889 19638 33619 5200 0 \n", "24 2004 6 68990 19619 33619 8500 0 \n", "25 2008 2 80949 19619 33619 7900 0 \n", "26 2003 7 59858 5546 33619 4600 0 \n", "27 2006 4 50227 19619 33619 7500 0 \n", "28 2006 4 58024 20928 33619 5600 0 \n", "29 2006 4 40919 20928 33619 7700 0 \n", "... ... ... ... ... ... ... ... 
\n", "51058 2002 7 81794 18881 30212 3400 0 \n", "51059 2008 1 49069 18881 30212 8000 0 \n", "51060 2006 3 69007 18111 30212 5600 0 \n", "51061 2006 3 62228 18111 30212 4525 0 \n", "51062 2001 8 72553 18111 30212 4325 0 \n", "51063 2005 4 72224 18111 30212 6100 0 \n", "51064 2006 3 64020 18111 30212 7440 0 \n", "51065 2005 4 48447 18111 30212 7340 0 \n", "51066 2004 5 81403 18111 30212 8900 0 \n", "51067 2005 4 77249 18111 30212 5330 0 \n", "51068 2006 3 60549 18111 30212 5600 0 \n", "51069 2002 7 82568 18111 30212 4015 0 \n", "51070 2006 3 64990 18111 30212 6835 0 \n", "51071 2007 2 60074 18111 30212 5700 0 \n", "51072 2006 3 84168 18881 30212 4800 0 \n", "51073 2007 2 72802 18881 30212 9500 0 \n", "51074 2005 4 59383 18111 30212 9000 0 \n", "51075 2002 7 75700 18881 30212 7000 0 \n", "51076 2006 3 70004 18881 30212 5000 0 \n", "51077 2005 4 48642 18881 30212 5500 0 \n", "51078 2006 3 57444 18881 30212 9800 0 \n", "51079 2004 5 69098 18881 30212 4500 0 \n", "51080 2004 5 76391 18111 30212 4200 0 \n", "51081 2007 2 44622 18881 30212 6000 0 \n", "51082 2006 3 69941 18111 30212 10400 0 \n", "51083 2002 7 93744 18111 30212 7500 0 \n", "51084 2007 2 74407 18111 30212 8000 0 \n", "51085 2004 5 82563 18881 30212 7000 0 \n", "51086 2006 3 65399 18111 30212 7900 0 \n", "51087 2006 3 79554 18881 30212 7000 0 \n", "\n", " WarrantyCost \n", "0 1113 \n", "1 1053 \n", "2 1020 \n", "3 594 \n", "4 533 \n", "5 1623 \n", "6 1623 \n", "7 1455 \n", "8 1243 \n", "9 1923 \n", "10 1243 \n", "11 2003 \n", "12 671 \n", "13 754 \n", "14 2452 \n", "15 920 \n", "16 1763 \n", "17 1923 \n", "18 671 \n", "19 1272 \n", "20 2322 \n", "21 971 \n", "22 1389 \n", "23 594 \n", "24 1215 \n", "25 2152 \n", "26 1220 \n", "27 1003 \n", "28 671 \n", "29 623 \n", "... ... 
\n", "51058 2063 \n", "51059 482 \n", "51060 728 \n", "51061 533 \n", "51062 1220 \n", "51063 1038 \n", "51064 1703 \n", "51065 1328 \n", "51066 983 \n", "51067 1389 \n", "51068 533 \n", "51069 1543 \n", "51070 1703 \n", "51071 533 \n", "51072 1243 \n", "51073 1389 \n", "51074 1417 \n", "51075 1455 \n", "51076 1155 \n", "51077 482 \n", "51078 1251 \n", "51079 533 \n", "51080 803 \n", "51081 482 \n", "51082 1606 \n", "51083 1353 \n", "51084 803 \n", "51085 1243 \n", "51086 1508 \n", "51087 1974 \n", "\n", "[51088 rows x 8 columns]" ] } ], "prompt_number": 46 }, { "cell_type": "code", "collapsed": false, "input": [
 "# Reload the raw training data, discard the PRIMEUNIT and AUCGUART columns,\n",
 "# then keep only the rows that have no missing value left.\n",
 "lemons = (pd.read_csv('./data/lemons.csv')\n",
 "            .drop(['PRIMEUNIT', 'AUCGUART'], axis=1)\n",
 "            .dropna(axis=0))\n",
 "# Numeric columns are the ones describe() reports; RefId (an index) and\n",
 "# IsBadBuy (the prediction value) are not features, so take them out.\n",
 "features = list(lemons.describe().columns)\n",
 "for non_feature in ('RefId', 'IsBadBuy'):\n",
 "    features.remove(non_feature)"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "lemons[features].head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
VehYearVehicleAgeWheelTypeIDVehOdoMMRAcquisitionAuctionAveragePriceMMRAcquisitionAuctionCleanPriceMMRAcquisitionRetailAveragePriceMMRAcquisitonRetailCleanPriceMMRCurrentAuctionAveragePriceMMRCurrentAuctionCleanPriceMMRCurrentRetailAveragePriceMMRCurrentRetailCleanPriceBYRNOVNZIP1VehBCostIsOnlineSaleWarrantyCost
0 2006 3 1 89046 8155 9829 11636 13600 7451 8552 11597 12409 21973 33619 7100 0 1113
1 2004 5 1 93593 6854 8383 10897 12572 7456 9222 11374 12791 19638 33619 7600 0 1053
2 2005 4 2 69367 3913 5054 7723 8707 3247 4384 6739 7911 19638 33619 4000 0 1020
3 2004 5 2 81054 3901 4908 6706 8577 4709 5827 8149 9451 19638 33619 5600 0 594
4 2004 5 2 65328 2966 4038 6240 8496 2980 4115 6230 8603 19638 33619 4200 0 533
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 28, "text": [ " VehYear VehicleAge WheelTypeID VehOdo \\\n", "0 2006 3 1 89046 \n", "1 2004 5 1 93593 \n", "2 2005 4 2 69367 \n", "3 2004 5 2 81054 \n", "4 2004 5 2 65328 \n", "\n", " MMRAcquisitionAuctionAveragePrice MMRAcquisitionAuctionCleanPrice \\\n", "0 8155 9829 \n", "1 6854 8383 \n", "2 3913 5054 \n", "3 3901 4908 \n", "4 2966 4038 \n", "\n", " MMRAcquisitionRetailAveragePrice MMRAcquisitonRetailCleanPrice \\\n", "0 11636 13600 \n", "1 10897 12572 \n", "2 7723 8707 \n", "3 6706 8577 \n", "4 6240 8496 \n", "\n", " MMRCurrentAuctionAveragePrice MMRCurrentAuctionCleanPrice \\\n", "0 7451 8552 \n", "1 7456 9222 \n", "2 3247 4384 \n", "3 4709 5827 \n", "4 2980 4115 \n", "\n", " MMRCurrentRetailAveragePrice MMRCurrentRetailCleanPrice BYRNO VNZIP1 \\\n", "0 11597 12409 21973 33619 \n", "1 11374 12791 19638 33619 \n", "2 6739 7911 19638 33619 \n", "3 8149 9451 19638 33619 \n", "4 6230 8603 19638 33619 \n", "\n", " VehBCost IsOnlineSale WarrantyCost \n", "0 7100 0 1113 \n", "1 7600 0 1053 \n", "2 4000 0 1020 \n", "3 5600 0 594 \n", "4 4200 0 533 " ] } ], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [
 "from sklearn import feature_selection as f_select\n",
 "\n",
 "# ANOVA F-test of each numeric feature against IsBadBuy; keep the features with p < 0.05.\n",
 "significant_features = []\n",
 "pvals = []  # p-values of the kept features, parallel to significant_features\n",
 "for feature in features:\n",
 "    # f_classif takes a 2-D (n_samples, n_features) X, hence the one-column frame;\n",
 "    # it returns one F statistic and one p-value per column.\n",
 "    f_stats, p_values = f_select.f_classif(lemons[[feature]], lemons.IsBadBuy)\n",
 "    if p_values[0] < 0.05:\n",
 "        significant_features.append(feature)\n",
 "        pvals.append(p_values[0])"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": [ "print features\n", "significant_features" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['VehYear', 'VehicleAge', 'WheelTypeID', 'VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 
'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'BYRNO', 'VNZIP1', 'VehBCost', 'IsOnlineSale', 'WarrantyCost']\n" ] }, { "metadata": {}, "output_type": "pyout", "prompt_number": 33, "text": [ "['VehYear',\n", " 'VehicleAge',\n", " 'WheelTypeID',\n", " 'VehOdo',\n", " 'MMRAcquisitionAuctionAveragePrice',\n", " 'MMRAcquisitionAuctionCleanPrice',\n", " 'MMRAcquisitionRetailAveragePrice',\n", " 'MMRAcquisitonRetailCleanPrice',\n", " 'MMRCurrentAuctionAveragePrice',\n", " 'MMRCurrentAuctionCleanPrice',\n", " 'MMRCurrentRetailAveragePrice',\n", " 'MMRCurrentRetailCleanPrice',\n", " 'BYRNO',\n", " 'VehBCost',\n", " 'IsOnlineSale',\n", " 'WarrantyCost']" ] } ], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }