{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    }
   ],
   "source": [
    "%pylab inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json, re\n",
    "from collections import defaultdict\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.grid_search import GridSearchCV\n",
    "from sklearn.metrics import roc_curve, auc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "giver_username_if_known                                                                             N/A\n",
       "request_id                                                                                     t3_l25d7\n",
       "body                                                  Hi I am in need of food for my 4 children we a...\n",
       "title                                                           Request Colorado Springs Help Us Please\n",
       "requester_age                                                                                         0\n",
       "requester_days_since_first_post_on_raop_at_request                                                    0\n",
       "requester_number_of_comments_at_request                                                               0\n",
       "requester_number_of_comments_in_raop_at_request                                                       0\n",
       "prior_posts                                                                                           0\n",
       "prior_raop_posts                                                                                      0\n",
       "requester_number_of_subreddits_at_request                                                             0\n",
       "requester_subreddits_at_request                                                                      []\n",
       "karma                                                                                                 0\n",
       "requester_upvotes_plus_downvotes_at_request                                                           0\n",
       "requester_username                                                                            nickylvst\n",
       "unix_timestamp_of_request                                                                    1317852607\n",
       "epoch                                                                                        1317849007\n",
       "_data                                                                                             train\n",
       "got_pizza                                                                                             0\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfs = {}\n",
    "for name in ['train', 'test']:\n",
    "    df = pd.read_json('../data/%s.json' % name)\n",
    "    df['_data'] = name\n",
    "    dfs[name] = df\n",
    "\n",
    "# combine train and test data into one df\n",
    "df = dfs['train'].append(dfs['test'])\n",
    "df = df.reset_index(drop=True)\n",
    "\n",
    "# limit to shared columns (plus predictor)\n",
    "cols = list(dfs['test'].columns) + ['requester_received_pizza']\n",
    "df = df[cols]\n",
    "\n",
    "# rename a few columns to be pithier\n",
    "df.rename(columns={\n",
    "        'request_title': 'title', \n",
    "        'request_text_edit_aware': 'body',\n",
    "        'requester_upvotes_minus_downvotes_at_request': 'karma',\n",
    "        'requester_number_of_posts_at_request': 'prior_posts',\n",
    "        'requester_number_of_posts_on_raop_at_request': 'prior_raop_posts',\n",
    "        'requester_account_age_in_days_at_request': 'requester_age',\n",
    "        'unix_timestamp_of_request_utc': 'epoch',\n",
    "        'requester_received_pizza': 'got_pizza',\n",
    "}, inplace=True)\n",
    "\n",
    "# convert got pizza indicator to ints\n",
    "df['got_pizza'] = df['got_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))\n",
    "\n",
    "df.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Request Colorado Springs Help Us Please Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated\n",
      "--\n",
      "request colorado springs help us please hi i am in need of food for my children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated\n",
      "--\n"
     ]
    }
   ],
   "source": [
    "# clean up text field (lowercase, letters only)\n",
    "\n",
    "def clean_txt(raw, remove_stop=False):\n",
    "    # remove non-letters\n",
    "    letters_only = re.sub(\"[^a-zA-Z]\", \" \", raw) \n",
    "\n",
    "    # convert to lower case, split into individual words\n",
    "    words = letters_only.lower().split()                             \n",
    "\n",
    "    if remove_stop:\n",
    "        stops = set(stopwords.words(\"english\"))\n",
    "        words = [w for w in words if not w in stops]\n",
    "    \n",
    "    # join cleaned words\n",
    "    return \" \".join(words)\n",
    "\n",
    "\n",
    "# combine title and body columns, then clean\n",
    "df['txt_raw'] = df['title'] + ' ' + df['body']\n",
    "df['txt_clean'] = df['txt_raw'].apply(clean_txt)\n",
    "\n",
    "\n",
    "# check that it worked\n",
    "for col in ['txt_raw', 'txt_clean']:\n",
    "    print df.iloc[0][col]\n",
    "    print '--'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         date  day  month  community_age\n",
      "0  2011-10-05    5     10            232\n",
      "1  2012-03-25   25      3            404\n",
      "2  2011-10-26   26     10            253\n",
      "3  2011-12-02    2     12            290\n",
      "4  2013-07-12   12      7            878\n"
     ]
    }
   ],
   "source": [
    "# temporal features\n",
    "\n",
    "dt = pd.to_datetime(df['epoch'], unit='s')\n",
    "dt = pd.DatetimeIndex(dt)\n",
    "\n",
    "df['date'] = dt.date\n",
    "df['day'] = dt.day\n",
    "df['month'] = dt.month\n",
    "df['dow'] = dt.dayofweek\n",
    "df['community_age'] = (dt - min(dt)).days.astype(int)\n",
    "\n",
    "temporal_cols = [\n",
    "    'day',\n",
    "    'month',\n",
    "    'community_age',\n",
    "]\n",
    "\n",
    "print df[['date'] + temporal_cols].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               karma  prior_raop_posts  prior_posts  requester_age\n",
      "count    5671.000000       5671.000000  5671.000000    5671.000000\n",
      "mean     1163.804796          0.058014    21.353024     252.905251\n",
      "std      3486.076626          0.310860    50.203577     302.625587\n",
      "min      -173.000000          0.000000     0.000000       0.000000\n",
      "25%         3.000000          0.000000     0.000000       3.584606\n",
      "50%       170.000000          0.000000     5.000000     155.647593\n",
      "75%      1159.000000          0.000000    22.000000     386.932639\n",
      "max    155010.000000          5.000000   867.000000    2809.750787\n"
     ]
    }
   ],
   "source": [
    "# status features\n",
    "\n",
    "status_cols = [\n",
    "    'karma',\n",
    "    'prior_raop_posts',\n",
    "    'prior_posts',\n",
    "    'requester_age',\n",
    "]\n",
    "\n",
    "print df[status_cols].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cat list: ['craving', 'family', 'job', 'money', 'student']\n",
      "\n",
      "checking word to category lookups:\n",
      "university - categories: ['student']\n",
      "parent - categories: ['family']\n",
      "cash - categories: ['money']\n"
     ]
    }
   ],
   "source": [
    "# narrative groupings from paper\n",
    "# source: http://cs.stanford.edu/~althoff/raop-dataset/altruistic_requests_icwsm.pdf\n",
    "\n",
    "narrative_cats = {\n",
    "    'money': [\n",
    "        'money', 'now', 'broke', 'week', 'until', 'time',\n",
    "        'last', 'day', 'when', 'today', 'tonight', 'paid', 'next',\n",
    "        'first', 'night', 'after', 'tomorrow', 'month', 'while',\n",
    "        'account', 'before', 'long', 'Friday', 'rent', 'buy',\n",
    "        'bank', 'still', 'bills', 'bills', 'ago', 'cash', 'due', 'due',\n",
    "        'soon', 'past', 'never', 'paycheck', 'check', 'spent',\n",
    "        'years', 'poor', 'till', 'yesterday', 'morning', 'dollars',\n",
    "        'financial', 'hour', 'bill', 'evening', 'credit',\n",
    "        'budget', 'loan', 'bucks', 'deposit', 'dollar', 'current',\n",
    "        'payed',        \n",
    "    ],\n",
    "    'job': [\n",
    "        'work', 'job', 'paycheck', 'unemployment', 'interview',\n",
    "        'fired', 'employment', 'hired', 'hire',        \n",
    "    ],\n",
    "    'student': [\n",
    "        'college', 'student', 'school', 'roommate',\n",
    "        'studying', 'university', 'finals', 'semester',\n",
    "        'class', 'study', 'project', 'dorm', 'tuition',        \n",
    "    ],\n",
    "    'family': [\n",
    "        'family', 'mom', 'wife', 'parents', 'mother', 'husband',\n",
    "        'dad', 'son', 'daughter', 'father', 'parent',\n",
    "        'mum',        \n",
    "    ],\n",
    "    'craving': [\n",
    "        'friend', 'girlfriend', 'craving', 'birthday',\n",
    "        'boyfriend', 'celebrate', 'party', 'game', 'games',\n",
    "        'movie', 'date', 'drunk', 'beer', 'celebrating', 'invited',\n",
    "        'drinks', 'crave', 'wasted', 'invite',        \n",
    "    ],\n",
    "}\n",
    "\n",
    "\n",
    "# list of narrative category names\n",
    "cat_list = sorted(narrative_cats.keys())\n",
    "print 'cat list: %s\\n' % cat_list\n",
    "\n",
    "\n",
    "# create word to category mapping\n",
    "word_to_cats = defaultdict(list)\n",
    "for cat, words in narrative_cats.iteritems():\n",
    "    for word in words:\n",
    "        word_to_cats[word].append(cat)\n",
    "word_to_cats = dict(word_to_cats)\n",
    "\n",
    "\n",
    "# check that things are working\n",
    "print 'checking word to category lookups:'\n",
    "for word in ['university', 'parent', 'cash']:\n",
    "    print '%s - categories: %s' % (\n",
    "        word,\n",
    "        word_to_cats.get(word, 'NONE')\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "request colorado springs help us please hi i am in need of food for my children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated\n",
      "{'money': 1, 'family': 2}\n",
      "\n",
      "---\n",
      "\n",
      "request california no cash and i could use some dinner i spent the last money i had on gas today im broke until next thursday\n",
      "{'money': 8}\n",
      "\n",
      "---\n",
      "\n",
      "request hungry couple in dundee scotland would love some pizza my girlfriend decided it would be a good idea to get off at perth bus station when she was coming to visit me and has since had to spend all her money on a taxi to get to me here in dundee any chance some kind soul would get us some pizza since we don t have any cash anymore\n",
      "{'money': 3, 'craving': 1}\n",
      "\n",
      "---\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# loop through cleaned text and count occurrences\n",
    "# of words in each narrative category\n",
    "\n",
    "def categorize(words):\n",
    "    cats = defaultdict(int)\n",
    "    for word in words.split():\n",
    "        matches = word_to_cats.get(word)\n",
    "        if matches:\n",
    "            for m in matches:\n",
    "                cats[m] += 1\n",
    "    return dict(cats)\n",
    "\n",
    "\n",
    "df['txt_cats'] = df['txt_clean'].apply(categorize)\n",
    "\n",
    "\n",
    "# check that it worked\n",
    "for i in range(3):\n",
    "    ex = df.iloc[i]\n",
    "    print ex['txt_clean']\n",
    "    print ex['txt_cats']\n",
    "    print '\\n---\\n'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "txt_cats        {u'money': 1, u'family': 2}\n",
       "narr_craving                              0\n",
       "narr_family                      0.02777778\n",
       "narr_job                                  0\n",
       "narr_money                       0.01388889\n",
       "narr_student                              0\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# turn data dict into indiv columns (narrative features)\n",
    "\n",
    "def to_freq(row, cat):\n",
    "    cats, txt = row['txt_cats'], row['txt_clean']\n",
    "    if cats.get(cat) > 0:\n",
    "        return cats.get(cat) * 1.0 / len(txt.split())\n",
    "    else:\n",
    "        return 0\n",
    "\n",
    "for cat in cat_list:\n",
    "    df['narr_%s' % cat] = df.apply(lambda row: to_freq(row, cat), axis=1)\n",
    "\n",
    "# assign variable to the list of these new cols\n",
    "narrative_cols = [c for c in df.columns if c.startswith('narr_')]\n",
    "\n",
    "# check that it worked\n",
    "df[['txt_cats'] + narrative_cols].iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "polite: {0: 3344, 1: 2327}\n",
      "hyperlink: {0: 5267, 1: 404}\n",
      "reciprocity: {0: 4497, 1: 1174}\n"
     ]
    }
   ],
   "source": [
    "# add a few more, potentially useful features\n",
    "\n",
    "# has link\n",
    "df['hyperlink'] = df['body'].apply(lambda x: 1 if re.search(\"http|www\", x) else 0)\n",
    "\n",
    "# character length of title + body fields\n",
    "df['txt_chars'] = df['txt_clean'].apply(lambda x: len(x))\n",
    "\n",
    "# politeness indicator\n",
    "df['polite'] = df['txt_clean'].apply(lambda x: 1 if re.search(\"thank|appreciate|advance\", x) else 0)\n",
    "\n",
    "# reciprocity indicator\n",
    "df['reciprocity'] = df['txt_clean'].apply(lambda x: \n",
    "                                           1 if re.search(\"repay|pay.+back|pay.+forward|return.+favor\", x) \n",
    "                                           else 0)\n",
    "\n",
    "\n",
    "# check their distributions\n",
    "for col in ['polite', 'hyperlink', 'reciprocity']:\n",
    "    print '%s: %s' % (\n",
    "        col, \n",
    "        df[col].value_counts().to_dict()\n",
    "    )\n",
    "\n",
    "\n",
    "# combine these new cols together\n",
    "additional_cols = [\n",
    "    'txt_chars',\n",
    "    'polite',\n",
    "    'hyperlink',\n",
    "    'reciprocity',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['day', 'month', 'community_age', 'karma', 'prior_raop_posts', 'prior_posts', 'requester_age', 'narr_craving', 'narr_family', 'narr_job', 'narr_money', 'narr_student', 'txt_chars', 'polite', 'hyperlink', 'reciprocity']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>day</th>\n",
       "      <th>month</th>\n",
       "      <th>community_age</th>\n",
       "      <th>karma</th>\n",
       "      <th>prior_raop_posts</th>\n",
       "      <th>prior_posts</th>\n",
       "      <th>requester_age</th>\n",
       "      <th>narr_craving</th>\n",
       "      <th>narr_family</th>\n",
       "      <th>narr_job</th>\n",
       "      <th>narr_money</th>\n",
       "      <th>narr_student</th>\n",
       "      <th>txt_chars</th>\n",
       "      <th>polite</th>\n",
       "      <th>hyperlink</th>\n",
       "      <th>reciprocity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>  5</td>\n",
       "      <td> 10</td>\n",
       "      <td> 232</td>\n",
       "      <td>    0</td>\n",
       "      <td> 0</td>\n",
       "      <td>  0</td>\n",
       "      <td>   0.000000</td>\n",
       "      <td> 0.000000</td>\n",
       "      <td> 0.027778</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0.013889</td>\n",
       "      <td> 0.000000</td>\n",
       "      <td> 354</td>\n",
       "      <td> 1</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> 25</td>\n",
       "      <td>  3</td>\n",
       "      <td> 404</td>\n",
       "      <td>   34</td>\n",
       "      <td> 0</td>\n",
       "      <td> 15</td>\n",
       "      <td> 501.111100</td>\n",
       "      <td> 0.000000</td>\n",
       "      <td> 0.000000</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0.320000</td>\n",
       "      <td> 0.000000</td>\n",
       "      <td> 125</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> 26</td>\n",
       "      <td> 10</td>\n",
       "      <td> 253</td>\n",
       "      <td>    0</td>\n",
       "      <td> 0</td>\n",
       "      <td>  0</td>\n",
       "      <td>   0.000000</td>\n",
       "      <td> 0.014286</td>\n",
       "      <td> 0.000000</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0.042857</td>\n",
       "      <td> 0.000000</td>\n",
       "      <td> 338</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>  2</td>\n",
       "      <td> 12</td>\n",
       "      <td> 290</td>\n",
       "      <td>   54</td>\n",
       "      <td> 0</td>\n",
       "      <td>  1</td>\n",
       "      <td>   6.518438</td>\n",
       "      <td> 0.000000</td>\n",
       "      <td> 0.022222</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0.022222</td>\n",
       "      <td> 0.022222</td>\n",
       "      <td> 227</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> 12</td>\n",
       "      <td>  7</td>\n",
       "      <td> 878</td>\n",
       "      <td> 1121</td>\n",
       "      <td> 0</td>\n",
       "      <td> 14</td>\n",
       "      <td> 162.063252</td>\n",
       "      <td> 0.033898</td>\n",
       "      <td> 0.000000</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0.033898</td>\n",
       "      <td> 0.000000</td>\n",
       "      <td> 548</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0</td>\n",
       "      <td> 0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   day  month  community_age  karma  prior_raop_posts  prior_posts  \\\n",
       "0    5     10            232      0                 0            0   \n",
       "1   25      3            404     34                 0           15   \n",
       "2   26     10            253      0                 0            0   \n",
       "3    2     12            290     54                 0            1   \n",
       "4   12      7            878   1121                 0           14   \n",
       "\n",
       "   requester_age  narr_craving  narr_family  narr_job  narr_money  \\\n",
       "0       0.000000      0.000000     0.027778         0    0.013889   \n",
       "1     501.111100      0.000000     0.000000         0    0.320000   \n",
       "2       0.000000      0.014286     0.000000         0    0.042857   \n",
       "3       6.518438      0.000000     0.022222         0    0.022222   \n",
       "4     162.063252      0.033898     0.000000         0    0.033898   \n",
       "\n",
       "   narr_student  txt_chars  polite  hyperlink  reciprocity  \n",
       "0      0.000000        354       1          0            0  \n",
       "1      0.000000        125       0          0            0  \n",
       "2      0.000000        338       0          0            0  \n",
       "3      0.022222        227       0          0            0  \n",
       "4      0.000000        548       0          0            0  "
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# combine features (and check that things look good)\n",
    "x_cols = temporal_cols + status_cols + narrative_cols + additional_cols\n",
    "\n",
    "print x_cols\n",
    "df[x_cols].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# set up framework to quickly iterate on\n",
    "# different feature sets and algorithm params\n",
    "\n",
    "def get_data():\n",
    "    data = df[df['_data'] == 'train'].copy()\n",
    "    return data\n",
    "\n",
    "\n",
    "def prep_data(data, input_cols):\n",
    "    X = data[input_cols].as_matrix()\n",
    "    y = data['got_pizza'].astype(int).as_matrix()\n",
    "    \n",
    "    return X, y\n",
    "\n",
    "    \n",
    "def predict(input_cols, model_params={}):\n",
    "    data = get_data()    \n",
    "    X, y = prep_data(data, input_cols)\n",
    "    rando = 123\n",
    "    \n",
    "    Xr, Xt, yr, yt = train_test_split(X, y, random_state=rando)\n",
    "\n",
    "    model_params.update({\n",
    "        'random_state': rando,\n",
    "    })\n",
    "    model = GradientBoostingClassifier(**model_params)\n",
    "    \n",
    "    model = model.fit(Xr, yr)\n",
    "    ypred = model.predict_proba(Xt)[:, 1]\n",
    "    \n",
    "    fpr, tpr, thresholds = roc_curve(yt, ypred)\n",
    "    auc_val = auc(fpr, tpr)\n",
    "    return auc_val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.546242789819\n",
      "0.580598346078\n",
      "0.688667943786\n",
      "0.657436579432\n"
     ]
    }
   ],
   "source": [
    "# try out a few different feature sets + model params\n",
    "\n",
    "# just narrative features\n",
    "print predict(narrative_cols)\n",
    "\n",
    "# just temporal features\n",
    "print predict(temporal_cols)\n",
    "\n",
    "# all features\n",
    "print predict(x_cols)\n",
    "\n",
    "# all features with more n_estimators\n",
    "print predict(x_cols, {'n_estimators': 1000})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.693051973072\n",
      "{'n_estimators': 500, 'learning_rate': 0.01, 'max_depth': 4}\n"
     ]
    }
   ],
   "source": [
    "# model parameter tuning\n",
    "# (this takes a little while to run)\n",
    "\n",
    "param_grid = {\n",
    "    'n_estimators': [100, 500, 1000],\n",
    "    'learning_rate': [0.005, 0.01, 0.02],\n",
    "    'max_depth': [2, 3, 4],\n",
    "}\n",
    "\n",
    "model = GradientBoostingClassifier(random_state=123)\n",
    "grid_search = GridSearchCV(model, param_grid, cv=6, verbose=0, scoring='roc_auc')\n",
    "grid_search.fit(X_train, y_train)\n",
    "\n",
    "print grid_search.best_score_\n",
    "print grid_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "boom.\n"
     ]
    }
   ],
   "source": [
    "# finally, train classifier over entire training set \n",
    "# with best params from grid search and save predictions\n",
    "\n",
    "df_train = df[df['_data'] == 'train'].copy()\n",
    "X_train = df_train[x_cols].as_matrix()\n",
    "y_train = df_train['got_pizza'].astype(int).as_matrix()\n",
    "\n",
    "model = GradientBoostingClassifier(\n",
    "    n_estimators=500, \n",
    "    learning_rate=0.01, \n",
    "    max_depth=4, \n",
    "    random_state=123\n",
    ")\n",
    "model = model.fit(X_train, y_train)\n",
    "\n",
    "df_test = df[df['_data'] == 'test'].copy()\n",
    "X_test = df_test[x_cols].as_matrix()\n",
    "\n",
    "ypred = model.predict_proba(X_test)[:, 1]\n",
    "df_test['requester_received_pizza'] = ypred\n",
    "\n",
    "final_df = df_test[['request_id', 'requester_received_pizza']]\n",
    "final_df.to_csv('../output/predicted.csv', index=False)\n",
    "print 'boom.'"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}