from pandas import Series, DataFrame
import pandas as pd
import numpy as np

import sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing

# --- Load data ---------------------------------------------------------------
# Seed for the random train/test assignment so that a fresh
# "Restart Kernel -> Run All" reproduces the same partition.
SEED = 42

# DataFrame.from_csv is deprecated (and removed in pandas 1.0); read_csv is
# the supported equivalent. index_col=False keeps the first CSV column as
# data instead of promoting it to the row index.
air_raw = pd.read_csv("allyears_tiny.csv", index_col=False)
print(air_raw.head())

# One uniform draw in [0, 1) per row; later cells threshold this column to
# choose the train and test rows. Assigning the raw array (rather than a
# Series) avoids index alignment, so it is correct for any row index.
rng = np.random.RandomState(SEED)
air_raw['RandNum'] = rng.uniform(size=len(air_raw))
print(air_raw.head())
# --- Feature engineering -----------------------------------------------------

def _one_hot(values, prefix):
    """One-hot encode a categorical column with its own LabelBinarizer.

    Parameters
    ----------
    values : pandas.Series
        Categorical column to encode.
    prefix : str
        Text prepended to every generated column name, so encoded columns
        from different source columns never collide.

    Returns
    -------
    (binarizer, frame)
        The fitted LabelBinarizer and a DataFrame of indicator columns named
        "<prefix>_<label>", sharing the row index of `values`.
    """
    binarizer = sklearn.preprocessing.LabelBinarizer()
    encoded = binarizer.fit_transform(values)
    labels = list(binarizer.classes_)
    # LabelBinarizer emits a single indicator column for a two-class input
    # (the indicator of the second class), so keep only the trailing labels
    # when there are fewer columns than classes.
    if encoded.shape[1] != len(labels):
        labels = labels[-encoded.shape[1]:]
    columns = ['%s_%s' % (prefix, label) for label in labels]
    return binarizer, DataFrame(encoded, columns=columns, index=values.index)


# Target: 1 when the flight departed late, 0 otherwise (vectorized; missing
# values compare unequal to 'YES' and therefore map to 0, as before).
air_mapped = DataFrame(
    {
        'RandNum': air_raw['RandNum'],
        'IsDepDelayedInt': (air_raw['IsDepDelayed'] == 'YES').astype(int),
    },
    columns=['RandNum', 'IsDepDelayedInt'],
)
print(air_mapped.shape)

# Each categorical column is encoded with the binarizer fitted on *that*
# column. The previous version fitted lb_dest / lb_uniquecarrier but then
# called lb_origin.transform() for both, encoding destinations and carriers
# against the origin label set.
lb_origin, origin_onehot = _one_hot(air_raw['Origin'], 'Origin')
print(origin_onehot.shape)

lb_dest, dest_onehot = _one_hot(air_raw['Dest'], 'Dest')
print(dest_onehot.shape)

lb_uniquecarrier, carrier_onehot = _one_hot(air_raw['UniqueCarrier'], 'UniqueCarrier')
print(carrier_onehot.shape)

air_mapped = pd.concat(
    [
        air_mapped,
        origin_onehot,
        dest_onehot,
        air_raw['Distance'],
        carrier_onehot,
        air_raw['Month'],
        air_raw['DayofMonth'],
        air_raw['DayOfWeek'],
    ],
    axis=1,
)
# Prefixed names guarantee every feature column is uniquely addressable.
assert air_mapped.columns.is_unique, "duplicate feature column names"
print(air_mapped.shape)

air = air_mapped

# --- Train / test split ------------------------------------------------------
# Rows are assigned by their RandNum draw: <= 0.8 -> train, > 0.9 -> test.
# The (0.8, 0.9] band is intentionally left out; it is reserved for a
# validation set that is not used yet.
air_train = air.loc[air['RandNum'] <= 0.8]
air_test = air.loc[air['RandNum'] > 0.9]

print(air_train.shape)
print(air_test.shape)

# --- Training matrices -------------------------------------------------------
# Features are everything except the split helper column and the target.
X_train = air_train.drop(['RandNum', 'IsDepDelayedInt'], axis=1)
print(list(X_train.columns.values))
print(X_train.shape)

y_train = air_train['IsDepDelayedInt']
print(y_train.shape)

# --- Model -------------------------------------------------------------------
# NOTE(review): 10 trees at learning_rate=0.01 is a very weak ensemble; the
# saved run predicted the same class for every test row. Consider tuning
# these once the encoding fix above is in place.
clf = GradientBoostingClassifier(
    n_estimators=10,
    max_depth=3,
    learning_rate=0.01,
    random_state=0,  # fixed so repeated fits on the same data agree
)
clf.fit(X_train, y_train)
# --- Predict on the held-out test rows ---------------------------------------
# Build the test feature matrix by dropping the split helper column and the
# target from the test partition (returns a new frame; air_test is untouched).
X_test = air_test.drop(['RandNum', 'IsDepDelayedInt'], axis=1)
print(X_test.columns.tolist())
print(X_test.shape)

# Blank line, banner, blank line -- same three lines of output as before.
for banner_line in ("", "--- PREDICTIONS ---", ""):
    print(banner_line)

# Predicted class label for every test row, using the classifier fitted above.
pred = clf.predict(X_test)
print(pred)