{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import time\n", "import pandas as pd\n", "import numpy as np\n", "import json\n", "import os\n", "import urllib\n", "import urllib2\n", "import datetime\n", "\n", "\n", "user_agent = (\"Project for Data Science class v1.0\" \" /u/Valedra\" \" https://github.com/jaysayre/intelligentdolphins\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we've downloaded all of the data we need using Redditscraping.ipynb, we now need to assemble it and download other pieces of information. First, lets combine all of the data frames." ] }, { "cell_type": "code", "collapsed": false, "input": [ "file_dir = \"Data/New/\"\n", "\n", "path, dirs, files = os.walk(file_dir).next()\n", "csvfiles = [file_dir + i for i in files if \".csv\" in i ] #Builds a list with .csv files\n", "csvfiles.sort()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "def filemerge(csvfiles):\n", " if len(csvfiles) >= 2:\n", " df = pd.DataFrame()\n", " for csvfile in csvfiles:\n", " dfnew = pd.read_csv(csvfile, encoding='utf-8')\n", " df = df.append(dfnew)\n", " return df\n", " else:\n", " print 'Not enough files'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "filemerge(csvfiles).to_csv('Data/combined.csv', index=False, encoding='utf-8')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, let's add something here that will download a seperate dataframe containing the total karma each poster in our new, large data frame has." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('Data/combined.csv', encoding='utf-8')\n", "\n", "def add_karma(df, user_agent, display=20, start=0, end=len(df)):\n", " '''\n", "Adds in the karma score for every author, and returns the same df except with the new information.\n", "This part takes a bit longer, as we have to make a get request for every user.\n", "\n", "Parameters --\n", " df: input dataframe\n", " user_agent: same as before\n", " display: How often one wants status updates on the download progress\n", " \n", "Returns --\n", " A pandas dataframe with the same information, in addition to overall and link karma for the post author\n", " '''\n", " \n", " count = 0\n", " dfidlist = list(df.index)\n", " dfidlist = dfidlist[start : end]\n", " df2 = pd.DataFrame({'karma':df['id'], 'link_karma':df['subreddit'], 'author':df['author']})\n", " for i in dfidlist:\n", " try:\n", " reddit_url = 'http://www.reddit.com/user/%s/about.json' % df['author'][i]\n", " headers = {'User-agent': user_agent}\n", " df2['author'][i] = df['author'][i]\n", " jsondata = json_extract(reddit_url, headers)\n", " df2['karma'][i] = jsondata['data']['comment_karma']\n", " try:\n", " df2['link_karma'][i] = jsondata['data']['link_karma']\n", " except:\n", " df2['link_karma'][i] = 0\n", " count += 1\n", " except:\n", " df2['karma'][i] = 0\n", " df2['link_karma'][i] = 0\n", " count += 1\n", " if count%int(display) == 0:\n", " print \"Retrieved karma for %s users\" % count \n", " return df2\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "def json_extract(baseurl, headrs=None, params=None, extraparam=None):\n", " '''\n", " Helper function to download and read json data. Takes in explanatory headers and returns json dict.\n", " '''\n", " if params != None:\n", " if extraparam != None:\n", " params['t'] = extraparam\n", " form = urllib.urlencode(params)\n", " url = baseurl+form\n", " else:\n", " url = baseurl\n", " if headrs != None:\n", " request = urllib2.Request(url, headers=headrs)\n", " else: \n", " request = urllib2.Request(url)\n", " return json.loads(urllib2.urlopen(request).read())\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "#Add in karma... Be careful as this is really quite slow\n", "df = pd.read_csv('Data/combined.csv', encoding='utf-8')\n", "df2 = add_karma(df, user_agent, display=40) #Change the start and end\n", "df2.to_csv('Data/karma.csv', index=False, encoding='utf-8')\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we've downloaded the karma information, we can now merge it into our original data set." ] }, { "cell_type": "code", "collapsed": false, "input": [ "fulldf = df\n", "fulldf['karma'] = df2['karma']\n", "fulldf['link_karma'] = df2['link_karma']\n", "fulldf.to_csv('Data/full.csv', index=False, encoding='utf-8')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, we need to download one more thing... all of the comments related to a given post! Well, not all, but just the top comments for a given post." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "def get_comments(subreddit, postid, sort_call, subtype, user_agent):\n", " '''\n", " Parameters --\n", " subreddit: subreddit title\n", " postid: 6 digit id corresponding to the post\n", " sort_call: one of confidence, top, new, hot, controversial, old, random\n", " user_agent: same as before\n", " \n", " Returns --\n", " '''\n", " reddit_base = 'http://www.reddit.com/r/%s/comments/%s.json?' % (subreddit, postid) \n", " headers = {'User-agent': user_agent}\n", " post_params = {'sort': sort_call}\n", " jsondata = json_extract(reddit_base, headers, post_params)\n", " comments, ids, ups, downs, authors, distin = [], [], [], [], [], []\n", " for item in jsondata[1]['data']['children']:\n", " for key, value in item['data'].items():\n", " if key == \"author\":\n", " if value == None:\n", " authors.append('null')\n", " elif value == '[deleted]':\n", " authors.append('null')\n", " else:\n", " authors.append(value)\n", " \n", " elif key == \"id\":\n", " if value == None:\n", " ids.append('null')\n", " elif value == '[deleted]':\n", " ids.append('null')\n", " else:\n", " ids.append(str(value))\n", " elif key == \"body\":\n", " if value == None:\n", " comments.append('null')\n", " elif value == '[deleted]':\n", " comments.append('null')\n", " else:\n", " comments.append(value)#.replace('\\n', ''))\n", " elif key == \"ups\":\n", " if value == None:\n", " ups.append('null')\n", " elif value == '[deleted]':\n", " ups.append('null')\n", " else:\n", " ups.append(value)\n", " elif key == \"downs\":\n", " if value == None:\n", " downs.append('null')\n", " elif value == '[deleted]':\n", " downs.append('null')\n", " else:\n", " downs.append(value)\n", " elif key == \"distinguished\":\n", " if value == None:\n", " distin.append('null')\n", " elif value == '[deleted]':\n", " distin.append('null')\n", " else:\n", " distin.append(value)\n", " else:\n", " pass\n", " \n", " try:\n", " return pd.DataFrame({'comment': comments, 'id': ids, 'ups': ups, 'downs': downs, 'author': authors, \n", " 'distinguished': distin, 'subreddit': subreddit, 'post': postid, 'type': subtype})\n", " except:\n", " #print len(comments), len(ids), len(ups), len(downs), len(authors)\n", " ids.pop(0) #Might need to be more formulaic\n", " return pd.DataFrame({'comment': comments, 'id': ids, 'ups': ups, 'downs': downs, 'author': authors, \n", " 'distinguished': distin, 'subreddit': subreddit, 'post': postid, 'type': subtype})\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "bigdf = df\n", "subs = list(bigdf['subreddit'].unique()) \n", "types = list(bigdf['type'].unique())" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ "This will download a seperate dataframe with the comments for each subreddit type. It will take a while, as it has to make a seperate api call for each post and then download those comments." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "for sub in subs:\n", " print sub\n", " for typ in types:\n", " print typ\n", " smalldf = bigdf[bigdf['subreddit'] == sub]\n", " smalldf = smalldf[smalldf['type'] == typ]\n", " dfidlist = list(smalldf.index)\n", " comments = pd.DataFrame()\n", " for i in dfidlist:\n", " try:\n", " comments = comments.append(get_comments(smalldf['subreddit'][i], smalldf['id'][i], 'top', smalldf['type'][i], user_agent))\n", " except:\n", " print i\n", " with open('Data/comments/missingcomments.txt', 'a') as the_file:\n", " the_file.write(smalldf['id'][i])\n", " the_file.write(' ' + sub)\n", " the_file.write(' ' + typ + '\\n')\n", " filename = 'Data/comments/' + sub + typ + '.csv'\n", " comments.to_csv(filename, index=False, encoding='utf-8')\n", " " ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we've downloaded all of the comment files, we can begin to clean them up a little bit." ] }, { "cell_type": "code", "collapsed": false, "input": [ "comment_dir = \"Data/comments/\"\n", "\n", "path, dirs, files = os.walk(comment_dir).next()\n", "commentfiles = [comment_dir + i for i in files if \".csv\" in i ] #Builds a list with .csv files\n", "commentfiles.sort()\n", "\n", "#commentfiles" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "# Function to determine if float\n", "def isfloat(x):\n", " try:\n", " float(x)\n", " return True\n", " except:\n", " return False\n", " \n", "# Function to clean strings\n", "def filterstr(x):\n", " try:\n", " y= ''.join(e for e in x if (e == ' ' or e.isalnum()) and not e.isdigit())\n", " return y\n", " except:\n", " return ''\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [ "def subset_comment(df):\n", " # Makes sure commenter isn't a moderator\n", " df = df[df['distinguished'].apply(lambda x: x == 'null')] \n", " df = df.drop('distinguished',1)\n", " # Occasionally, text or null entries will be in areas where only ints should be\n", " df=df[df['downs'].apply(lambda x: isfloat(x))]\n", " df=df[df['ups'].apply(lambda x: isfloat(x))]\n", " # Removes null comments or authors\n", " df = df[df['comment'].apply(lambda x: x != 'null')]\n", " df = df[df['author'].apply(lambda x: x != 'null')]\n", " \n", " # Ids can be weird as well, sometimes they manifest as long numbers\n", " # Reddit uses a id system in base 36, so I doubt any reasonable id will be longer than 8\n", " df=df[df['id'].apply(lambda x: len(str(x)) <= 8)]\n", " df=df[df['post'].apply(lambda x: len(str(x)) <= 8)]\n", " \n", " # Filter out characters that cause problems later, including all punctuation...\n", " for i in df.index:\n", " df['comment'][i] = filterstr(df['comment'][i])\n", " df['author'][i] = filterstr(df['author'][i])\n", " df = df.drop_duplicates() #Remove duplicate entries, should they exist\n", " return df" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 52 }, { "cell_type": "code", "collapsed": false, "input": [ "for commentfile in commentfiles: \n", " commentdf = pd.read_csv(commentfile, encoding='utf-8')\n", " commentdf = subset_comment(commentdf)\n", " commentdf['score'] = commentdf['ups'].astype(int)-commentdf['downs'].astype(int)\n", " commentdf.to_csv(commentfile, index=False, encoding='utf-8')\n" ], "language": "python", "metadata": {}, "outputs": [], 
"prompt_number": 53 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's merge all of the comment files for a given subreddit, which will help us with comment parsing." ] }, { "cell_type": "code", "collapsed": false, "input": [ "ath = ['Data/comments/atheismhot.csv',\n", " 'Data/comments/atheismnew.csv',\n", " 'Data/comments/atheismtop_all.csv',\n", " 'Data/comments/atheismtop_day.csv',\n", " 'Data/comments/atheismtop_week.csv']\n", "poli = ['Data/comments/politicshot.csv',\n", " 'Data/comments/politicsnew.csv',\n", " 'Data/comments/politicstop_all.csv',\n", " 'Data/comments/politicstop_day.csv',\n", " 'Data/comments/politicstop_week.csv']\n", "nos = ['Data/comments/nosleephot.csv',\n", " 'Data/comments/nosleepnew.csv',\n", " 'Data/comments/nosleeptop_all.csv',\n", " 'Data/comments/nosleeptop_day.csv',\n", " 'Data/comments/nosleeptop_week.csv']\n", "ptyr = ['Data/comments/pettyrevengehot.csv',\n", " 'Data/comments/pettyrevengenew.csv',\n", " 'Data/comments/pettyrevengetop_all.csv',\n", " 'Data/comments/pettyrevengetop_day.csv',\n", " 'Data/comments/pettyrevengetop_week.csv']\n", "joke = ['Data/comments/jokeshot.csv',\n", " 'Data/comments/jokesnew.csv',\n", " 'Data/comments/jokestop_all.csv',\n", " 'Data/comments/jokestop_day.csv',\n", " 'Data/comments/jokestop_week.csv']\n", "askh = ['Data/comments/askhistorianshot.csv',\n", " 'Data/comments/askhistoriansnew.csv',\n", " 'Data/comments/askhistorianstop_all.csv',\n", " 'Data/comments/askhistorianstop_day.csv',\n", " 'Data/comments/askhistorianstop_week.csv']\n", "tfts = ['Data/comments/TalesFromTechsupporthot.csv',\n", " 'Data/comments/TalesFromTechsupportnew.csv',\n", " 'Data/comments/TalesFromTechsupporttop_all.csv',\n", " 'Data/comments/TalesFromTechsupporttop_day.csv',\n", " 'Data/comments/TalesFromTechsupporttop_week.csv']\n", "ar = ['Data/comments/AskReddithot.csv',\n", " 'Data/comments/AskRedditnew.csv',\n", " 'Data/comments/AskReddittop_all.csv',\n", " 'Data/comments/AskReddittop_day.csv',\n", " 'Data/comments/AskReddittop_week.csv']\n", "tfr = ['Data/comments/talesFromRetailhot.csv',\n", " 'Data/comments/talesFromRetailnew.csv',\n", " 'Data/comments/talesFromRetailtop_all.csv',\n", " 'Data/comments/talesFromRetailtop_day.csv',\n", " 'Data/comments/talesFromRetailtop_week.csv']\n", "asksci = ['Data/comments/asksciencehot.csv',\n", " 'Data/comments/asksciencenew.csv',\n", " 'Data/comments/asksciencetop_all.csv',\n", " 'Data/comments/asksciencetop_day.csv',\n", " 'Data/comments/asksciencetop_week.csv']\n", "tifu = ['Data/comments/tifuhot.csv',\n", " 'Data/comments/tifunew.csv',\n", " 'Data/comments/tifutop_all.csv',\n", " 'Data/comments/tifutop_day.csv',\n", " 'Data/comments/tifutop_week.csv']\n", "eli5 = ['Data/comments/explainlikeimfivehot.csv',\n", " 'Data/comments/explainlikeimfivenew.csv',\n", " 'Data/comments/explainlikeimfivetop_all.csv',\n", " 'Data/comments/explainlikeimfivetop_day.csv',\n", " 'Data/comments/explainlikeimfivetop_week.csv']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "filemerge(ath).to_csv('Data/combinedcomments/atheism.csv', index=False, encoding='utf-8')\n", "filemerge(poli).to_csv('Data/combinedcomments/politics.csv', index=False, encoding='utf-8')\n", "filemerge(nos).to_csv('Data/combinedcomments/nosleep.csv', index=False, encoding='utf-8')\n", "filemerge(ptyr).to_csv('Data/combinedcomments/pettyrevenge.csv', index=False, encoding='utf-8')\n", "filemerge(joke).to_csv('Data/combinedcomments/jokes.csv', 
index=False, encoding='utf-8')\n", "filemerge(askh).to_csv('Data/combinedcomments/askhistorians.csv', index=False, encoding='utf-8')\n", "filemerge(tfts).to_csv('Data/combinedcomments/TalesFromTechsupport.csv', index=False, encoding='utf-8')\n", "filemerge(ar).to_csv('Data/combinedcomments/AskReddit.csv', index=False, encoding='utf-8')\n", "filemerge(tfr).to_csv('Data/combinedcomments/talesFromRetail.csv', index=False, encoding='utf-8')\n", "filemerge(asksci).to_csv('Data/combinedcomments/askscience.csv', index=False, encoding='utf-8')\n", "filemerge(tifu).to_csv('Data/combinedcomments/tifu.csv', index=False, encoding='utf-8')\n", "filemerge(eli5).to_csv('Data/combinedcomments/explainlikeimfive.csv', index=False, encoding='utf-8')\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }