{ "cells": [ { "cell_type": "code", "execution_count": 89, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "source": [ "%pylab inline" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import json, re\n", "from collections import defaultdict\n", "import pandas as pd\n", "import numpy as np\n", "\n", "from nltk.corpus import stopwords\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.grid_search import GridSearchCV\n", "from sklearn.metrics import roc_curve, auc" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "giver_username_if_known N/A\n", "request_id t3_l25d7\n", "body Hi I am in need of food for my 4 children we a...\n", "title Request Colorado Springs Help Us Please\n", "requester_age 0\n", "requester_days_since_first_post_on_raop_at_request 0\n", "requester_number_of_comments_at_request 0\n", "requester_number_of_comments_in_raop_at_request 0\n", "prior_posts 0\n", "prior_raop_posts 0\n", "requester_number_of_subreddits_at_request 0\n", "requester_subreddits_at_request []\n", "karma 0\n", "requester_upvotes_plus_downvotes_at_request 0\n", "requester_username nickylvst\n", "unix_timestamp_of_request 1317852607\n", "epoch 1317849007\n", "_data train\n", "got_pizza 0\n", "Name: 0, dtype: object" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfs = {}\n", "for name in ['train', 'test']:\n", " df = pd.read_json('../data/%s.json' % name)\n", " df['_data'] = name\n", " dfs[name] = df\n", "\n", "# combine train and test data into one df\n", "df = dfs['train'].append(dfs['test'])\n", "df = df.reset_index(drop=True)\n", "\n", "# limit to shared columns (plus predictor)\n", "cols = list(dfs['test'].columns) + ['requester_received_pizza']\n", "df = df[cols]\n", "\n", "# rename a few columns to be pithier\n", "df.rename(columns={\n", " 'request_title': 'title', \n", " 'request_text_edit_aware': 'body',\n", " 'requester_upvotes_minus_downvotes_at_request': 'karma',\n", " 'requester_number_of_posts_at_request': 'prior_posts',\n", " 'requester_number_of_posts_on_raop_at_request': 'prior_raop_posts',\n", " 'requester_account_age_in_days_at_request': 'requester_age',\n", " 'unix_timestamp_of_request_utc': 'epoch',\n", " 'requester_received_pizza': 'got_pizza',\n", "}, inplace=True)\n", "\n", "# convert got pizza indicator to ints\n", "df['got_pizza'] = df['got_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))\n", "\n", "df.iloc[0]" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Request Colorado Springs Help Us Please Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated\n", "--\n", "request colorado springs help us please hi i am in need of food for my children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated\n", "--\n" ] } ], "source": [ "# clean up text field (lowercase, letters only)\n", "\n", "def clean_txt(raw, remove_stop=False):\n", " # remove non-letters\n", " letters_only = re.sub(\"[^a-zA-Z]\", \" \", raw) \n", "\n", " # convert to lower case, split into individual words\n", " words = letters_only.lower().split() \n", "\n", " if remove_stop:\n", " stops = set(stopwords.words(\"english\"))\n", " words = [w for w in words if not w in stops]\n", " \n", " # join cleaned words\n", " return \" \".join(words)\n", "\n", "\n", "# combine title and body columns, then clean\n", "df['txt_raw'] = df['title'] + ' ' + df['body']\n", "df['txt_clean'] = df['txt_raw'].apply(clean_txt)\n", "\n", "\n", "# check that it worked\n", "for col in ['txt_raw', 'txt_clean']:\n", " print df.iloc[0][col]\n", " print '--'" ] }, { "cell_type": "code", "execution_count": 93, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " date day month community_age\n", "0 2011-10-05 5 10 232\n", "1 2012-03-25 25 3 404\n", "2 2011-10-26 26 10 253\n", "3 2011-12-02 2 12 290\n", "4 2013-07-12 12 7 878\n" ] } ], "source": [ "# temporal features\n", "\n", "dt = pd.to_datetime(df['epoch'], unit='s')\n", "dt = pd.DatetimeIndex(dt)\n", "\n", "df['date'] = dt.date\n", "df['day'] = dt.day\n", "df['month'] = dt.month\n", "df['dow'] = dt.dayofweek\n", "df['community_age'] = (dt - min(dt)).days.astype(int)\n", "\n", "temporal_cols = [\n", " 'day',\n", " 'month',\n", " 'community_age',\n", "]\n", "\n", "print df[['date'] + temporal_cols].head()" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " karma prior_raop_posts prior_posts requester_age\n", "count 5671.000000 5671.000000 5671.000000 5671.000000\n", "mean 1163.804796 0.058014 21.353024 252.905251\n", "std 3486.076626 0.310860 50.203577 302.625587\n", "min -173.000000 0.000000 0.000000 0.000000\n", "25% 3.000000 0.000000 0.000000 3.584606\n", "50% 170.000000 0.000000 5.000000 155.647593\n", "75% 1159.000000 0.000000 22.000000 386.932639\n", "max 155010.000000 5.000000 867.000000 2809.750787\n" ] } ], "source": [ "# status features\n", "\n", "status_cols = [\n", " 'karma',\n", " 'prior_raop_posts',\n", " 'prior_posts',\n", " 'requester_age',\n", "]\n", "\n", "print df[status_cols].describe()" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cat list: ['craving', 'family', 'job', 'money', 'student']\n", "\n", "checking word to category lookups:\n", "university - categories: ['student']\n", "parent - categories: ['family']\n", "cash - categories: ['money']\n" ] } ], "source": [ "# narrative groupings from paper\n", "# source: http://cs.stanford.edu/~althoff/raop-dataset/altruistic_requests_icwsm.pdf\n", "\n", "narrative_cats = {\n", " 'money': [\n", " 'money', 'now', 'broke', 'week', 'until', 'time',\n", " 'last', 'day', 'when', 'today', 'tonight', 'paid', 'next',\n", " 'first', 'night', 'after', 'tomorrow', 'month', 'while',\n", " 'account', 'before', 'long', 'Friday', 'rent', 'buy',\n", " 'bank', 'still', 'bills', 'bills', 'ago', 'cash', 'due', 'due',\n", " 'soon', 'past', 'never', 'paycheck', 'check', 'spent',\n", " 'years', 'poor', 'till', 'yesterday', 'morning', 'dollars',\n", " 'financial', 'hour', 'bill', 'evening', 'credit',\n", " 'budget', 'loan', 'bucks', 'deposit', 'dollar', 'current',\n", " 'payed', \n", " ],\n", " 'job': [\n", " 'work', 'job', 'paycheck', 'unemployment', 'interview',\n", " 'fired', 'employment', 'hired', 'hire', \n", " ],\n", " 'student': [\n", " 'college', 'student', 'school', 'roommate',\n", " 'studying', 'university', 'finals', 'semester',\n", " 'class', 'study', 'project', 'dorm', 'tuition', \n", " ],\n", " 'family': [\n", " 'family', 'mom', 'wife', 'parents', 'mother', 'husband',\n", " 'dad', 'son', 'daughter', 'father', 'parent',\n", " 'mum', \n", " ],\n", " 'craving': [\n", " 'friend', 'girlfriend', 'craving', 'birthday',\n", " 'boyfriend', 'celebrate', 'party', 'game', 'games',\n", " 'movie', 'date', 'drunk', 'beer', 'celebrating', 'invited',\n", " 'drinks', 'crave', 'wasted', 'invite', \n", " ],\n", "}\n", "\n", "\n", "# list of narrative category names\n", "cat_list = sorted(narrative_cats.keys())\n", "print 'cat list: %s\\n' % cat_list\n", "\n", "\n", "# create word to category mapping\n", "word_to_cats = defaultdict(list)\n", "for cat, words in narrative_cats.iteritems():\n", " for word in words:\n", " word_to_cats[word].append(cat)\n", "word_to_cats = dict(word_to_cats)\n", "\n", "\n", "# check that things are working\n", "print 'checking word to category lookups:'\n", "for word in ['university', 'parent', 'cash']:\n", " print '%s - categories: %s' % (\n", " word,\n", " word_to_cats.get(word, 'NONE')\n", " )" ] }, { "cell_type": "code", "execution_count": 96, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "request colorado springs help us please hi i am in need of food for my children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated\n", "{'money': 1, 'family': 2}\n", "\n", "---\n", "\n", "request california no cash and i could use some dinner i spent the last money i had on gas today im broke until next thursday\n", "{'money': 8}\n", "\n", "---\n", "\n", "request hungry couple in dundee scotland would love some pizza my girlfriend decided it would be a good idea to get off at perth bus station when she was coming to visit me and has since had to spend all her money on a taxi to get to me here in dundee any chance some kind soul would get us some pizza since we don t have any cash anymore\n", "{'money': 3, 'craving': 1}\n", "\n", "---\n", "\n" ] } ], "source": [ "# loop through cleaned text and count occurrences\n", "# of words in each narrative category\n", "\n", "def categorize(words):\n", " cats = defaultdict(int)\n", " for word in words.split():\n", " matches = word_to_cats.get(word)\n", " if matches:\n", " for m in matches:\n", " cats[m] += 1\n", " return dict(cats)\n", "\n", "\n", "df['txt_cats'] = df['txt_clean'].apply(categorize)\n", "\n", "\n", "# check that it worked\n", "for i in range(3):\n", " ex = df.iloc[i]\n", " print ex['txt_clean']\n", " print ex['txt_cats']\n", " print '\\n---\\n'" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "txt_cats {u'money': 1, u'family': 2}\n", "narr_craving 0\n", "narr_family 0.02777778\n", "narr_job 0\n", "narr_money 0.01388889\n", "narr_student 0\n", "Name: 0, dtype: object" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# turn data dict into indiv columns (narrative features)\n", "\n", "def to_freq(row, cat):\n", " cats, txt = row['txt_cats'], row['txt_clean']\n", " if cats.get(cat) > 0:\n", " return cats.get(cat) * 1.0 / len(txt.split())\n", " else:\n", " return 0\n", "\n", "for cat in cat_list:\n", " df['narr_%s' % cat] = df.apply(lambda row: to_freq(row, cat), axis=1)\n", "\n", "# assign variable to the list of these new cols\n", "narrative_cols = [c for c in df.columns if c.startswith('narr_')]\n", "\n", "# check that it worked\n", "df[['txt_cats'] + narrative_cols].iloc[0]" ] }, { "cell_type": "code", "execution_count": 98, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "polite: {0: 3344, 1: 2327}\n", "hyperlink: {0: 5267, 1: 404}\n", "reciprocity: {0: 4497, 1: 1174}\n" ] } ], "source": [ "# add a few more, potentially useful features\n", "\n", "# has link\n", "df['hyperlink'] = df['body'].apply(lambda x: 1 if re.search(\"http|www\", x) else 0)\n", "\n", "# character length of title + body fields\n", "df['txt_chars'] = df['txt_clean'].apply(lambda x: len(x))\n", "\n", "# politeness indicator\n", "df['polite'] = df['txt_clean'].apply(lambda x: 1 if re.search(\"thank|appreciate|advance\", x) else 0)\n", "\n", "# reciprocity indicator\n", "df['reciprocity'] = df['txt_clean'].apply(lambda x: \n", " 1 if re.search(\"repay|pay.+back|pay.+forward|return.+favor\", x) \n", " else 0)\n", "\n", "\n", "# check their distributions\n", "for col in ['polite', 'hyperlink', 'reciprocity']:\n", " print '%s: %s' % (\n", " col, \n", " df[col].value_counts().to_dict()\n", " )\n", "\n", "\n", "# combine these new cols together\n", "additional_cols = [\n", " 'txt_chars',\n", " 'polite',\n", " 'hyperlink',\n", " 'reciprocity',\n", "]" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['day', 'month', 'community_age', 'karma', 'prior_raop_posts', 'prior_posts', 'requester_age', 'narr_craving', 'narr_family', 'narr_job', 'narr_money', 'narr_student', 'txt_chars', 'polite', 'hyperlink', 'reciprocity']\n" ] }, { "data": { "text/html": [ "
\n", " | day | \n", "month | \n", "community_age | \n", "karma | \n", "prior_raop_posts | \n", "prior_posts | \n", "requester_age | \n", "narr_craving | \n", "narr_family | \n", "narr_job | \n", "narr_money | \n", "narr_student | \n", "txt_chars | \n", "polite | \n", "hyperlink | \n", "reciprocity | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "5 | \n", "10 | \n", "232 | \n", "0 | \n", "0 | \n", "0 | \n", "0.000000 | \n", "0.000000 | \n", "0.027778 | \n", "0 | \n", "0.013889 | \n", "0.000000 | \n", "354 | \n", "1 | \n", "0 | \n", "0 | \n", "
1 | \n", "25 | \n", "3 | \n", "404 | \n", "34 | \n", "0 | \n", "15 | \n", "501.111100 | \n", "0.000000 | \n", "0.000000 | \n", "0 | \n", "0.320000 | \n", "0.000000 | \n", "125 | \n", "0 | \n", "0 | \n", "0 | \n", "
2 | \n", "26 | \n", "10 | \n", "253 | \n", "0 | \n", "0 | \n", "0 | \n", "0.000000 | \n", "0.014286 | \n", "0.000000 | \n", "0 | \n", "0.042857 | \n", "0.000000 | \n", "338 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 | \n", "2 | \n", "12 | \n", "290 | \n", "54 | \n", "0 | \n", "1 | \n", "6.518438 | \n", "0.000000 | \n", "0.022222 | \n", "0 | \n", "0.022222 | \n", "0.022222 | \n", "227 | \n", "0 | \n", "0 | \n", "0 | \n", "
4 | \n", "12 | \n", "7 | \n", "878 | \n", "1121 | \n", "0 | \n", "14 | \n", "162.063252 | \n", "0.033898 | \n", "0.000000 | \n", "0 | \n", "0.033898 | \n", "0.000000 | \n", "548 | \n", "0 | \n", "0 | \n", "0 | \n", "