{ "cells": [ { "cell_type": "code", "execution_count": 89, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "source": [ "%pylab inline" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import json, re\n", "from collections import defaultdict\n", "import pandas as pd\n", "import numpy as np\n", "\n", "from nltk.corpus import stopwords\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.grid_search import GridSearchCV\n", "from sklearn.metrics import roc_curve, auc" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "giver_username_if_known N/A\n", "request_id t3_l25d7\n", "body Hi I am in need of food for my 4 children we a...\n", "title Request Colorado Springs Help Us Please\n", "requester_age 0\n", "requester_days_since_first_post_on_raop_at_request 0\n", "requester_number_of_comments_at_request 0\n", "requester_number_of_comments_in_raop_at_request 0\n", "prior_posts 0\n", "prior_raop_posts 0\n", "requester_number_of_subreddits_at_request 0\n", "requester_subreddits_at_request []\n", "karma 0\n", "requester_upvotes_plus_downvotes_at_request 0\n", "requester_username nickylvst\n", "unix_timestamp_of_request 1317852607\n", "epoch 1317849007\n", "_data train\n", "got_pizza 0\n", "Name: 0, dtype: object" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfs = {}\n", "for name in ['train', 'test']:\n", " df = pd.read_json('../data/%s.json' % name)\n", " df['_data'] = name\n", " dfs[name] = df\n", "\n", "# combine train and test data into one df\n", "df = dfs['train'].append(dfs['test'])\n", "df = df.reset_index(drop=True)\n", "\n", "# limit to shared columns (plus predictor)\n", "cols = list(dfs['test'].columns) + ['requester_received_pizza']\n", "df = df[cols]\n", "\n", "# rename a few columns to be pithier\n", "df.rename(columns={\n", " 'request_title': 'title', \n", " 'request_text_edit_aware': 'body',\n", " 'requester_upvotes_minus_downvotes_at_request': 'karma',\n", " 'requester_number_of_posts_at_request': 'prior_posts',\n", " 'requester_number_of_posts_on_raop_at_request': 'prior_raop_posts',\n", " 'requester_account_age_in_days_at_request': 'requester_age',\n", " 'unix_timestamp_of_request_utc': 'epoch',\n", " 'requester_received_pizza': 'got_pizza',\n", "}, inplace=True)\n", "\n", "# convert got pizza indicator to ints\n", "df['got_pizza'] = df['got_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))\n", "\n", "df.iloc[0]" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Request Colorado Springs Help Us Please Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated\n", "--\n", "request colorado springs help us please hi i am in need of food for my children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated\n", "--\n" ] } ], "source": [ "# clean up text field (lowercase, letters only)\n", "\n", "def clean_txt(raw, remove_stop=False):\n", " # remove non-letters\n", " letters_only = re.sub(\"[^a-zA-Z]\", \" \", raw) \n", "\n", " # convert to lower case, split into individual words\n", " words = letters_only.lower().split() \n", "\n", " if remove_stop:\n", " stops = set(stopwords.words(\"english\"))\n", " words = [w for w in words if not w in stops]\n", " \n", " # join cleaned words\n", " return \" \".join(words)\n", "\n", "\n", "# combine title and body columns, then clean\n", "df['txt_raw'] = df['title'] + ' ' + df['body']\n", "df['txt_clean'] = df['txt_raw'].apply(clean_txt)\n", "\n", "\n", "# check that it worked\n", "for col in ['txt_raw', 'txt_clean']:\n", " print df.iloc[0][col]\n", " print '--'" ] }, { "cell_type": "code", "execution_count": 93, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " date day month community_age\n", "0 2011-10-05 5 10 232\n", "1 2012-03-25 25 3 404\n", "2 2011-10-26 26 10 253\n", "3 2011-12-02 2 12 290\n", "4 2013-07-12 12 7 878\n" ] } ], "source": [ "# temporal features\n", "\n", "dt = pd.to_datetime(df['epoch'], unit='s')\n", "dt = pd.DatetimeIndex(dt)\n", "\n", "df['date'] = dt.date\n", "df['day'] = dt.day\n", "df['month'] = dt.month\n", "df['dow'] = dt.dayofweek\n", "df['community_age'] = (dt - min(dt)).days.astype(int)\n", "\n", "temporal_cols = [\n", " 'day',\n", " 'month',\n", " 'community_age',\n", "]\n", "\n", "print df[['date'] + temporal_cols].head()" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " karma prior_raop_posts prior_posts requester_age\n", "count 5671.000000 5671.000000 5671.000000 5671.000000\n", "mean 1163.804796 0.058014 21.353024 252.905251\n", "std 3486.076626 0.310860 50.203577 302.625587\n", "min -173.000000 0.000000 0.000000 0.000000\n", "25% 3.000000 0.000000 0.000000 3.584606\n", "50% 170.000000 0.000000 5.000000 155.647593\n", "75% 1159.000000 0.000000 22.000000 386.932639\n", "max 155010.000000 5.000000 867.000000 2809.750787\n" ] } ], "source": [ "# status features\n", "\n", "status_cols = [\n", " 'karma',\n", " 'prior_raop_posts',\n", " 'prior_posts',\n", " 'requester_age',\n", "]\n", "\n", "print df[status_cols].describe()" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cat list: ['craving', 'family', 'job', 'money', 'student']\n", "\n", "checking word to category lookups:\n", "university - categories: ['student']\n", "parent - categories: ['family']\n", "cash - categories: ['money']\n" ] } ], "source": [ "# narrative groupings from paper\n", "# source: http://cs.stanford.edu/~althoff/raop-dataset/altruistic_requests_icwsm.pdf\n", "\n", "narrative_cats = {\n", " 'money': [\n", " 'money', 'now', 'broke', 'week', 'until', 'time',\n", " 'last', 'day', 'when', 'today', 'tonight', 'paid', 'next',\n", " 'first', 'night', 'after', 'tomorrow', 'month', 'while',\n", " 'account', 'before', 'long', 'Friday', 'rent', 'buy',\n", " 'bank', 'still', 'bills', 'bills', 'ago', 'cash', 'due', 'due',\n", " 'soon', 'past', 'never', 'paycheck', 'check', 'spent',\n", " 'years', 'poor', 'till', 'yesterday', 'morning', 'dollars',\n", " 'financial', 'hour', 'bill', 'evening', 'credit',\n", " 'budget', 'loan', 'bucks', 'deposit', 'dollar', 'current',\n", " 'payed', \n", " ],\n", " 'job': [\n", " 'work', 'job', 'paycheck', 'unemployment', 'interview',\n", " 'fired', 'employment', 'hired', 'hire', \n", " ],\n", " 'student': [\n", " 'college', 'student', 'school', 'roommate',\n", " 'studying', 'university', 'finals', 'semester',\n", " 'class', 'study', 'project', 'dorm', 'tuition', \n", " ],\n", " 'family': [\n", " 'family', 'mom', 'wife', 'parents', 'mother', 'husband',\n", " 'dad', 'son', 'daughter', 'father', 'parent',\n", " 'mum', \n", " ],\n", " 'craving': [\n", " 'friend', 'girlfriend', 'craving', 'birthday',\n", " 'boyfriend', 'celebrate', 'party', 'game', 'games',\n", " 'movie', 'date', 'drunk', 'beer', 'celebrating', 'invited',\n", " 'drinks', 'crave', 'wasted', 'invite', \n", " ],\n", "}\n", "\n", "\n", "# list of narrative category names\n", "cat_list = sorted(narrative_cats.keys())\n", "print 'cat list: %s\\n' % cat_list\n", "\n", "\n", "# create word to category mapping\n", "word_to_cats = defaultdict(list)\n", "for cat, words in narrative_cats.iteritems():\n", " for word in words:\n", " word_to_cats[word].append(cat)\n", "word_to_cats = dict(word_to_cats)\n", "\n", "\n", "# check that things are working\n", "print 'checking word to category lookups:'\n", "for word in ['university', 'parent', 'cash']:\n", " print '%s - categories: %s' % (\n", " word,\n", " word_to_cats.get(word, 'NONE')\n", " )" ] }, { "cell_type": "code", "execution_count": 96, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "request colorado springs help us please hi i am in need of food for my children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated\n", "{'money': 1, 'family': 2}\n", "\n", "---\n", "\n", "request california no cash and i could use some dinner i spent the last money i had on gas today im broke until next thursday\n", "{'money': 8}\n", "\n", "---\n", "\n", "request hungry couple in dundee scotland would love some pizza my girlfriend decided it would be a good idea to get off at perth bus station when she was coming to visit me and has since had to spend all her money on a taxi to get to me here in dundee any chance some kind soul would get us some pizza since we don t have any cash anymore\n", "{'money': 3, 'craving': 1}\n", "\n", "---\n", "\n" ] } ], "source": [ "# loop through cleaned text and count occurrences\n", "# of words in each narrative category\n", "\n", "def categorize(words):\n", " cats = defaultdict(int)\n", " for word in words.split():\n", " matches = word_to_cats.get(word)\n", " if matches:\n", " for m in matches:\n", " cats[m] += 1\n", " return dict(cats)\n", "\n", "\n", "df['txt_cats'] = df['txt_clean'].apply(categorize)\n", "\n", "\n", "# check that it worked\n", "for i in range(3):\n", " ex = df.iloc[i]\n", " print ex['txt_clean']\n", " print ex['txt_cats']\n", " print '\\n---\\n'" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "txt_cats {u'money': 1, u'family': 2}\n", "narr_craving 0\n", "narr_family 0.02777778\n", "narr_job 0\n", "narr_money 0.01388889\n", "narr_student 0\n", "Name: 0, dtype: object" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# turn data dict into indiv columns (narrative features)\n", "\n", "def to_freq(row, cat):\n", " cats, txt = row['txt_cats'], row['txt_clean']\n", " if cats.get(cat) > 0:\n", " return cats.get(cat) * 1.0 / len(txt.split())\n", " else:\n", " return 0\n", "\n", "for cat in cat_list:\n", " df['narr_%s' % cat] = df.apply(lambda row: to_freq(row, cat), axis=1)\n", "\n", "# assign variable to the list of these new cols\n", "narrative_cols = [c for c in df.columns if c.startswith('narr_')]\n", "\n", "# check that it worked\n", "df[['txt_cats'] + narrative_cols].iloc[0]" ] }, { "cell_type": "code", "execution_count": 98, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "polite: {0: 3344, 1: 2327}\n", "hyperlink: {0: 5267, 1: 404}\n", "reciprocity: {0: 4497, 1: 1174}\n" ] } ], "source": [ "# add a few more, potentially useful features\n", "\n", "# has link\n", "df['hyperlink'] = df['body'].apply(lambda x: 1 if re.search(\"http|www\", x) else 0)\n", "\n", "# character length of title + body fields\n", "df['txt_chars'] = df['txt_clean'].apply(lambda x: len(x))\n", "\n", "# politeness indicator\n", "df['polite'] = df['txt_clean'].apply(lambda x: 1 if re.search(\"thank|appreciate|advance\", x) else 0)\n", "\n", "# reciprocity indicator\n", "df['reciprocity'] = df['txt_clean'].apply(lambda x: \n", " 1 if re.search(\"repay|pay.+back|pay.+forward|return.+favor\", x) \n", " else 0)\n", "\n", "\n", "# check their distributions\n", "for col in ['polite', 'hyperlink', 'reciprocity']:\n", " print '%s: %s' % (\n", " col, \n", " df[col].value_counts().to_dict()\n", " )\n", "\n", "\n", "# combine these new cols together\n", "additional_cols = [\n", " 'txt_chars',\n", " 'polite',\n", " 'hyperlink',\n", " 'reciprocity',\n", "]" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['day', 'month', 'community_age', 'karma', 'prior_raop_posts', 'prior_posts', 'requester_age', 'narr_craving', 'narr_family', 'narr_job', 'narr_money', 'narr_student', 'txt_chars', 'polite', 'hyperlink', 'reciprocity']\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
daymonthcommunity_agekarmaprior_raop_postsprior_postsrequester_agenarr_cravingnarr_familynarr_jobnarr_moneynarr_studenttxt_charspolitehyperlinkreciprocity
0 5 10 232 0 0 0 0.000000 0.000000 0.027778 0 0.013889 0.000000 354 1 0 0
1 25 3 404 34 0 15 501.111100 0.000000 0.000000 0 0.320000 0.000000 125 0 0 0
2 26 10 253 0 0 0 0.000000 0.014286 0.000000 0 0.042857 0.000000 338 0 0 0
3 2 12 290 54 0 1 6.518438 0.000000 0.022222 0 0.022222 0.022222 227 0 0 0
4 12 7 878 1121 0 14 162.063252 0.033898 0.000000 0 0.033898 0.000000 548 0 0 0
\n", "
" ], "text/plain": [ " day month community_age karma prior_raop_posts prior_posts \\\n", "0 5 10 232 0 0 0 \n", "1 25 3 404 34 0 15 \n", "2 26 10 253 0 0 0 \n", "3 2 12 290 54 0 1 \n", "4 12 7 878 1121 0 14 \n", "\n", " requester_age narr_craving narr_family narr_job narr_money \\\n", "0 0.000000 0.000000 0.027778 0 0.013889 \n", "1 501.111100 0.000000 0.000000 0 0.320000 \n", "2 0.000000 0.014286 0.000000 0 0.042857 \n", "3 6.518438 0.000000 0.022222 0 0.022222 \n", "4 162.063252 0.033898 0.000000 0 0.033898 \n", "\n", " narr_student txt_chars polite hyperlink reciprocity \n", "0 0.000000 354 1 0 0 \n", "1 0.000000 125 0 0 0 \n", "2 0.000000 338 0 0 0 \n", "3 0.022222 227 0 0 0 \n", "4 0.000000 548 0 0 0 " ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# combine features (and check that things look good)\n", "x_cols = temporal_cols + status_cols + narrative_cols + additional_cols\n", "\n", "print x_cols\n", "df[x_cols].head()" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# set up framework to quickly iterate on\n", "# different feature sets and algorithm params\n", "\n", "def get_data():\n", " data = df[df['_data'] == 'train'].copy()\n", " return data\n", "\n", "\n", "def prep_data(data, input_cols):\n", " X = data[input_cols].as_matrix()\n", " y = data['got_pizza'].astype(int).as_matrix()\n", " \n", " return X, y\n", "\n", " \n", "def predict(input_cols, model_params={}):\n", " data = get_data() \n", " X, y = prep_data(data, input_cols)\n", " rando = 123\n", " \n", " Xr, Xt, yr, yt = train_test_split(X, y, random_state=rando)\n", "\n", " model_params.update({\n", " 'random_state': rando,\n", " })\n", " model = GradientBoostingClassifier(**model_params)\n", " \n", " model = model.fit(Xr, yr)\n", " ypred = model.predict_proba(Xt)[:, 1]\n", " \n", " fpr, tpr, thresholds = roc_curve(yt, ypred)\n", " auc_val = auc(fpr, tpr)\n", " return auc_val" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.546242789819\n", "0.580598346078\n", "0.688667943786\n", "0.657436579432\n" ] } ], "source": [ "# try out a few different feature sets + model params\n", "\n", "# just narrative features\n", "print predict(narrative_cols)\n", "\n", "# just temporal features\n", "print predict(temporal_cols)\n", "\n", "# all features\n", "print predict(x_cols)\n", "\n", "# all features with more n_estimators\n", "print predict(x_cols, {'n_estimators': 1000})" ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.693051973072\n", "{'n_estimators': 500, 'learning_rate': 0.01, 'max_depth': 4}\n" ] } ], "source": [ "# model parameter tuning\n", "# (this takes a little while to run)\n", "\n", "param_grid = {\n", " 'n_estimators': [100, 500, 1000],\n", " 'learning_rate': [0.005, 0.01, 0.02],\n", " 'max_depth': [2, 3, 4],\n", "}\n", "\n", "model = GradientBoostingClassifier(random_state=123)\n", "grid_search = GridSearchCV(model, param_grid, cv=6, verbose=0, scoring='roc_auc')\n", "grid_search.fit(X_train, y_train)\n", "\n", "print grid_search.best_score_\n", "print grid_search.best_params_" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "boom.\n" ] } ], "source": [ "# finally, train classifier over entire training set \n", "# with best params from grid search and save predictions\n", "\n", "df_train = df[df['_data'] == 'train'].copy()\n", "X_train = df_train[x_cols].as_matrix()\n", "y_train = df_train['got_pizza'].astype(int).as_matrix()\n", "\n", "model = GradientBoostingClassifier(\n", " n_estimators=500, \n", " learning_rate=0.01, \n", " max_depth=4, \n", " random_state=123\n", ")\n", "model = model.fit(X_train, y_train)\n", "\n", "df_test = df[df['_data'] == 'test'].copy()\n", "X_test = df_test[x_cols].as_matrix()\n", "\n", "ypred = model.predict_proba(X_test)[:, 1]\n", "df_test['requester_received_pizza'] = ypred\n", "\n", "final_df = df_test[['request_id', 'requester_received_pizza']]\n", "final_df.to_csv('../output/predicted.csv', index=False)\n", "print 'boom.'" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }