{
 "metadata": {
  "name": "",
  "signature": "sha256:2d56745aec6056b08ab940251ec750304e4ffefa439ffe94ce85d3f6a239b29e"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# SVR baseline on four features\n",
      "\n",
      "Fits a support-vector regressor on `T1_V1`, `T1_V2`, `T2_V1` and `T2_V2` from `train.csv`, writes predictions for `test.csv` to `submission_3.csv`, then looks at the residuals on the training rows."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%matplotlib inline\n",
      "\n",
      "import numpy as np\n",
      "import pandas as pd\n",
      "import matplotlib.pyplot as plt\n",
      "import seaborn as sns\n",
      "from sklearn import svm\n",
      "from sklearn.ensemble import RandomForestClassifier\n",
      "pd.set_option('display.max_columns', 50)\n",
      "\n",
      "# DataFrame.from_csv is no longer available in current pandas; read_csv with\n",
      "# index_col=0 loads the same frame, indexed by the first column of the file.\n",
      "train = pd.read_csv('train.csv', index_col=0)\n",
      "\n",
      "###\n",
      "# Use numerical values for categorical data\n",
      "\n",
      "def makeNumerical(input_df):\n",
      "    \"\"\"Return a copy of input_df with its non-numeric columns integer-coded.\n",
      "\n",
      "    Every distinct value in a non-numeric column is replaced by its rank in\n",
      "    order of first appearance (0, 1, 2, ...). Numeric columns are returned\n",
      "    unchanged and missing values stay missing (NaN). input_df itself is\n",
      "    not modified.\n",
      "\n",
      "    The codes follow the row order of input_df, so frames encoded in\n",
      "    separate calls do not share a mapping.\n",
      "    \"\"\"\n",
      "    output_df = input_df.copy()\n",
      "    non_numeric_cols = output_df.select_dtypes(exclude=[np.number]).columns\n",
      "    for col in non_numeric_cols:\n",
      "        # factorize derives every code from the original values in one pass,\n",
      "        # so a code that was just written can never be mistaken for a raw\n",
      "        # value (e.g. 0 == False, 0 == 0.0) and re-encoded a second time.\n",
      "        codes, _ = pd.factorize(output_df[col])\n",
      "        if (codes < 0).any():\n",
      "            # factorize flags missing values with -1; keep them as NaN.\n",
      "            codes = np.where(codes < 0, np.nan, codes)\n",
      "        output_df[col] = codes\n",
      "\n",
      "    return output_df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "train_numerical = makeNumerical(train)\n",
      "features = ['T1_V1','T1_V2','T2_V1','T2_V2']\n",
      "target = 'Hazard'  # first data column of train.csv, after the Id index\n",
      "X = train_numerical[features].astype(float)\n",
      "y = train_numerical[target].astype(float)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# TODO: try making C a little lower, maybe change gamma\n",
      "clf = svm.SVR(C=0.5, gamma=.001)\n",
      "clf.fit(X, y)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Load test df, predict on it\n",
      "#\n",
      "# NOTE: makeNumerical numbers categories by order of first appearance, so its\n",
      "# codes for test.csv only line up with the training codes for columns that\n",
      "# are already numeric.\n",
      "test = makeNumerical(pd.read_csv('test.csv', index_col=0)[features])\n",
      "\n",
      "preds = clf.predict(test)\n",
      "\n",
      "out_df = pd.DataFrame({target: preds}, index=test.index)\n",
      "out_df.to_csv(\"submission_3.csv\", index=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 25
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Residuals on the training rows: prediction minus actual Hazard.\n",
      "train_residuals = clf.predict(X) - y"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 33
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "train_residuals.head(20)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 34,
       "text": [
        "Id\n",
        "1 0.853256\n",
        "2 -0.998832\n",
        "3 1.116817\n",
        "4 2.453021\n",
        "5 2.615416\n",
        "12 1.730286\n",
        "15 -1.931312\n",
        "19 2.509336\n",
        "21 0.911379\n",
        "22 0.386930\n",
        "23 -5.199475\n",
        "24 -11.527683\n",
        "25 1.994604\n",
        "26 -0.331582\n",
        "31 -9.513998\n",
        "32 3.166062\n",
        "33 1.301354\n",
        "39 0.776953\n",
        "41 -1.171212\n",
        "43 0.810963\n",
        "Name: Hazard, dtype: float64"
       ]
      }
     ],
     "prompt_number": 34
    }
   ],
   "metadata": {}
  }
 ]
}