{
 "metadata": {
  "name": "",
  "signature": "sha256:238192bc191ccef763a322d61832a22446eec2ca69d1ce26151cdc314e26f1a4"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# This one really didn't do well. Submission 3 performed much better."
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 49
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%matplotlib inline\n",
      "\n",
      "import numpy as np\n",
      "import pandas as pd\n",
      "import matplotlib.pyplot as plt\n",
      "import seaborn as sns\n",
      "from sklearn.neighbors import KNeighborsClassifier\n",
      "pd.set_option('display.max_columns', 50)\n",
      "\n",
      "# The first CSV column (Id) becomes the index. read_csv(index_col=0) is used\n",
      "# because DataFrame.from_csv is not available in current pandas releases.\n",
      "train = pd.read_csv('train.csv', index_col=0)\n",
      "\n",
      "###\n",
      "# Use numerical values for categorical data\n",
      "\n",
      "def makeNumerical(input_df):\n",
      "    \"\"\"Return a copy of input_df with every non-numeric column integer-encoded.\n",
      "\n",
      "    Each distinct value of a non-numeric column is replaced by an integer\n",
      "    code handed out in order of first appearance (0, 1, 2, ...). Numeric\n",
      "    columns (integer or float) are returned unchanged, and input_df itself\n",
      "    is never modified.\n",
      "\n",
      "    Caveats:\n",
      "    - Missing values are encoded as -1 (the pd.factorize convention).\n",
      "    - Codes depend on the order in which values appear in input_df, so two\n",
      "      frames encoded separately (e.g. train and test) are not guaranteed to\n",
      "      give the same category the same code.\n",
      "    \"\"\"\n",
      "    output_df = input_df.copy()\n",
      "    # Only genuinely non-numeric columns are encoded; float columns keep\n",
      "    # their values instead of being replaced by ordinal codes.\n",
      "    non_numeric_cols = output_df.select_dtypes(exclude=[np.number]).columns\n",
      "    for col in non_numeric_cols:\n",
      "        # factorize encodes the whole column in one pass, so a freshly\n",
      "        # written code can never be mistaken for an original value.\n",
      "        codes, _ = pd.factorize(output_df[col])\n",
      "        output_df[col] = codes\n",
      "\n",
      "    return output_df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 19
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "train_numerical = makeNumerical(train)\n",
      "features = ['T1_V1','T1_V2','T1_V3','T2_V1','T2_V2']\n",
      "X = train_numerical[features].copy().astype(float)\n",
      "# Select the target by name rather than by column position.\n",
      "y = train_numerical['Hazard'].copy().astype(float)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Ten rows with the highest Hazard value.\n",
      "train_numerical.sort_values('Hazard', ascending=False).head(10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
HazardT1_V1T1_V2T1_V3T1_V4T1_V5T1_V6T1_V7T1_V8T1_V9T1_V10T1_V11T1_V12T1_V13T1_V14T1_V15T1_V16T1_V17T2_V1T2_V2T2_V3T2_V4T2_V5T2_V6T2_V7T2_V8T2_V9T2_V10T2_V11T2_V12T2_V13T2_V14T2_V15
Id
28023 69 13 20 3 0 2 1 0 3 2 7 1 0 10 1 0 1 0 50 11 1 8 0 2 31 1 1 5 0 0 1 3 5
73541 64 19 13 4 3 2 1 3 3 2 7 1 1 15 1 0 6 0 57 9 1 11 4 2 40 1 13 3 0 0 1 2 10
67188 63 19 7 1 0 0 1 0 3 0 3 0 0 20 1 0 8 0 15 7 0 6 0 2 28 1 1 3 0 0 1 6 1
61658 52 10 15 1 0 3 1 0 3 0 12 4 1 15 1 0 7 0 20 10 0 7 2 2 37 1 9 1 0 0 0 4 1
18251 51 18 20 9 5 2 0 0 0 4 8 4 0 15 0 2 8 0 40 26 0 16 4 1 25 1 1 3 1 0 2 1 1
47720 49 8 23 2 0 1 1 0 3 2 12 1 0 10 1 0 3 0 54 15 0 11 2 3 37 1 8 7 0 0 1 3 8
66305 46 9 5 4 3 3 1 0 3 0 3 0 0 10 1 0 0 0 84 12 1 14 2 2 22 1 11 2 0 0 1 2 3
24519 46 12 22 1 0 1 1 3 0 2 8 1 0 15 1 0 3 0 43 13 1 16 2 1 31 1 11 3 0 0 3 1 4
33278 44 10 6 3 0 1 0 0 3 3 3 1 0 15 1 0 1 0 38 15 1 4 2 2 31 1 18 3 1 0 2 4 3
22623 42 15 9 6 3 4 0 0 3 2 3 1 0 20 1 0 1 0 7 13 1 11 2 1 34 1 1 3 1 0 1 1 7
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 18, "text": [ " Hazard T1_V1 T1_V2 T1_V3 T1_V4 T1_V5 T1_V6 T1_V7 T1_V8 T1_V9 \\\n", "Id \n", "28023 69 13 20 3 0 2 1 0 3 2 \n", "73541 64 19 13 4 3 2 1 3 3 2 \n", "67188 63 19 7 1 0 0 1 0 3 0 \n", "61658 52 10 15 1 0 3 1 0 3 0 \n", "18251 51 18 20 9 5 2 0 0 0 4 \n", "47720 49 8 23 2 0 1 1 0 3 2 \n", "66305 46 9 5 4 3 3 1 0 3 0 \n", "24519 46 12 22 1 0 1 1 3 0 2 \n", "33278 44 10 6 3 0 1 0 0 3 3 \n", "22623 42 15 9 6 3 4 0 0 3 2 \n", "\n", " T1_V10 T1_V11 T1_V12 T1_V13 T1_V14 T1_V15 T1_V16 T1_V17 T2_V1 \\\n", "Id \n", "28023 7 1 0 10 1 0 1 0 50 \n", "73541 7 1 1 15 1 0 6 0 57 \n", "67188 3 0 0 20 1 0 8 0 15 \n", "61658 12 4 1 15 1 0 7 0 20 \n", "18251 8 4 0 15 0 2 8 0 40 \n", "47720 12 1 0 10 1 0 3 0 54 \n", "66305 3 0 0 10 1 0 0 0 84 \n", "24519 8 1 0 15 1 0 3 0 43 \n", "33278 3 1 0 15 1 0 1 0 38 \n", "22623 3 1 0 20 1 0 1 0 7 \n", "\n", " T2_V2 T2_V3 T2_V4 T2_V5 T2_V6 T2_V7 T2_V8 T2_V9 T2_V10 T2_V11 \\\n", "Id \n", "28023 11 1 8 0 2 31 1 1 5 0 \n", "73541 9 1 11 4 2 40 1 13 3 0 \n", "67188 7 0 6 0 2 28 1 1 3 0 \n", "61658 10 0 7 2 2 37 1 9 1 0 \n", "18251 26 0 16 4 1 25 1 1 3 1 \n", "47720 15 0 11 2 3 37 1 8 7 0 \n", "66305 12 1 14 2 2 22 1 11 2 0 \n", "24519 13 1 16 2 1 31 1 11 3 0 \n", "33278 15 1 4 2 2 31 1 18 3 1 \n", "22623 13 1 11 2 1 34 1 1 3 1 \n", "\n", " T2_V12 T2_V13 T2_V14 T2_V15 \n", "Id \n", "28023 0 1 3 5 \n", "73541 0 1 2 10 \n", "67188 0 1 6 1 \n", "61658 0 0 4 1 \n", "18251 0 2 1 1 \n", "47720 0 1 3 8 \n", "66305 0 1 2 3 \n", "24519 0 3 1 4 \n", "33278 0 2 4 3 \n", "22623 0 1 1 7 " ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "knn = KNeighborsClassifier(n_neighbors=3, weights='distance')\n", "knn.fit(X,y)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 39, "text": [ "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", " metric_params=None, n_neighbors=3, 
p=2, weights='distance')" ] } ], "prompt_number": 39 },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Load test df, predict on it\n",
      "#\n",
      "# The test features go through the same makeNumerical step as the training\n",
      "# frame. NOTE(review): the two frames are encoded independently, so this is\n",
      "# only safe while every column in `features` is already numeric - confirm\n",
      "# before adding a categorical feature.\n",
      "\n",
      "# read_csv(index_col=0) keeps the first column (Id) as the index; it replaces\n",
      "# DataFrame.from_csv, which is not available in current pandas releases.\n",
      "test = makeNumerical(pd.read_csv('test.csv', index_col=0)[features])\n",
      "\n",
      "preds = knn.predict(test)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 40
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "preds[:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 41,
       "text": [
        "array([ 4., 1., 8., 1., 1., 6., 1., 2., 1., 4.])"
       ]
      }
     ],
     "prompt_number": 41
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Difference between the prediction and the label on the training rows.\n",
      "# NOTE(review): knn was built with weights='distance', and each training row\n",
      "# is presumably its own nearest neighbour at distance 0, so this is likely\n",
      "# all zeros and says nothing about generalisation - verify on held-out data.\n",
      "same_preds = knn.predict(X) - y\n",
      "pd.unique(same_preds)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 48
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Write the submission: one Hazard prediction per test row, with the test\n",
      "# index written as the first CSV column.\n",
      "out_df = pd.DataFrame({'Hazard': preds}, index=test.index)\n",
      "out_df.to_csv(\"submission_4.csv\", index=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}