{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import time\n", "import pandas as pd\n", "import json\n", "import os\n", "import urllib\n", "import urllib2\n", "import datetime" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "In order to use the reddit API we need to specify an user agent with a description and a link to our project" ] }, { "cell_type": "code", "collapsed": false, "input": [ "user_agent = (\"Project for Data Science class v1.0\" \" https://github.com/jaysayre/intelligentdolphins\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "def json_extract(baseurl, headrs=None, params=None, extraparam=None):\n", " '''\n", " Helper function to download and read json data. Takes in explanatory headers and returns json dict.\n", " '''\n", " if params != None:\n", " if extraparam != None:\n", " params['t'] = extraparam\n", " form = urllib.urlencode(params)\n", " url = baseurl+form\n", " else:\n", " url = baseurl\n", " if headrs != None:\n", " request = urllib2.Request(url, headers=headrs)\n", " else: \n", " request = urllib2.Request(url)\n", " return json.loads(urllib2.urlopen(request).read())\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "'''\n", "appends the data we currently look at to the relevant list\n", "'''\n", "def subreddit_json_parser(data, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates):\n", " for post in data['data']['children']:\n", " if post['kind'] == 't3':\n", " for key, value in post['data'].items():\n", " if key == \"title\":\n", " title.append(value)\n", " elif key == \"id\":\n", " ids.append(str(value))\n", " elif key == \"ups\":\n", " upvts.append(value)\n", " elif key == \"downs\":\n", " downvts.append(value)\n", " elif key == \"author\":\n", " authors.append(value)\n", " elif key == \"num_comments\":\n", " comments.append(value)\n", " elif key == \"score\":\n", " score.append(value)\n", " elif key == \"media\":\n", " media.append(value)\n", " elif key == \"distinguished\":\n", " distin.append(value)\n", " elif key == \"selftext\":\n", " selftxt.append(value)\n", " elif key == \"created_utc\":\n", " dates.append(value)\n", " else:\n", " pass" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "def get_subreddit_df(subreddit, sort_call, user_agent, n, t=None, api_call_limit=100, status=False):\n", " '''\n", " Builds a subreddit dataframe.\n", " \n", " Parameters --\n", " subreddit: specifies which reddit subreddit to pull information from\n", " sort_call: specifies whether you want to sort by top, hot, rising etc.\n", " user_agent: so reddit won't reject our https requests\n", " n: int describing how many top results one would like\n", " t: specifies whether scope should be week, hour, day, year, etc.\n", " api_call_limit: Reddit doesn't serve more than 100 results at a time\n", " status: whether or not you want the status of the download printed in the console\n", " \n", " Returns --\n", " \n", " A pandas dataframe containing various bits of information about the top posts.\n", " \n", " '''\n", " reddit_base = 'http://www.reddit.com/r/%s/%s.json?' 
% (subreddit, sort_call)  # Base API call\n", "    headers = {'User-agent': user_agent}\n", "\n", "    # Empty lists for the information we'll extract\n", "    ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates = [], [], [], [], [], [], [], [], [], [], []\n", "\n", "    # Make sure n and api_call_limit aren't floats!\n", "    n = int(n)\n", "    api_call_limit = int(api_call_limit)\n", "\n", "    # Since Reddit serves at most api_call_limit posts per request, split the n requested posts into several requests\n", "    if n % api_call_limit != 0:\n", "        remainder = n % api_call_limit\n", "        num = (n / api_call_limit) + 1\n", "    else:\n", "        num = n / api_call_limit\n", "        remainder = api_call_limit\n", "\n", "    # Makes an API call for all n entries based on the API call limit\n", "    for i in range(num):\n", "        if i == 0:\n", "            # If this is also the last request (num == 1), only ask for the remainder\n", "            post_params = {'limit': api_call_limit if num > 1 else remainder}\n", "            jsondata = json_extract(reddit_base, headers, post_params, t)\n", "            tostartfrom = jsondata['data']['after']\n", "            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates)\n", "            time.sleep(2)\n", "            if status:\n", "                print \"Downloaded %s posts...\" % len(set(ids))\n", "        elif i == num - 1:\n", "            post_params = {'limit': remainder, 'after': tostartfrom}  # 'after' is the post to continue from\n", "            jsondata = json_extract(reddit_base, headers, post_params, t)\n", "            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates)\n", "            time.sleep(2)\n", "            if status:\n", "                print \"Downloaded %s posts...\" % len(set(ids))\n", "        else:\n", "            post_params = {'limit': api_call_limit, 'after': tostartfrom}\n", "            jsondata = json_extract(reddit_base, headers, post_params, t)\n", "            tostartfrom = jsondata['data']['after']\n", "            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates)\n", "            time.sleep(2)\n", "            if status:\n", "                print \"Downloaded %s posts...\" % len(set(ids))\n", "\n", "    tempdict = {'id': ids, 'title': title, 'upvotes': upvts, 'media': media, 'distinguished': distin,\n", "                'selftext': selftxt, 'downvotes': downvts, 'comments': comments, 'score': score, 'author': authors, 'time_created': dates}\n", "\n", "    return pd.DataFrame(tempdict)\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "# Helper function to determine whether x can be interpreted as a number\n", "def isfloat(x):\n", "    try:\n", "        float(x)\n", "        return True\n", "    except (TypeError, ValueError):\n", "        return False\n", "\n", "# Helper function to clean strings: keep only letters and spaces\n", "def filterstr(x):\n", "    try:\n", "        return ''.join(e for e in x if (e == ' ' or e.isalnum()) and not e.isdigit())\n", "    except TypeError:\n", "        return ''" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "# Function to clean a downloaded subreddit dataframe\n", "def subset_subreddit(df):\n", "    df = df[df['distinguished'].apply(lambda x: x == None)]  # Make sure the poster isn't a moderator (distinguished)\n", "    df = df[df['media'].apply(lambda x: x == None)]  # Also make sure the post is text based\n", "    df = df[df['title'].apply(lambda x: x != None)]  # Make sure the title isn't empty\n", "    df = df.drop('distinguished', 1)\n", "    df = df.drop('media', 1)\n", "    # Occasionally, text strings will be in columns where only ints should be. 
This takes care of that.\n", "    df = df[df['comments'].apply(lambda x: isfloat(x))]\n", "    df = df[df['downvotes'].apply(lambda x: isfloat(x))]\n", "    df = df[df['score'].apply(lambda x: isfloat(x))]\n", "    df = df[df['upvotes'].apply(lambda x: isfloat(x))]\n", "    # Ids can be weird as well, sometimes they manifest as long numbers.\n", "    # Reddit uses an id system in base 36, so a reasonable id shouldn't be longer than 8 characters\n", "    df = df[df['id'].apply(lambda x: len(str(x)) <= 8)]\n", "    # Filter out characters that cause problems later\n", "    for i in df.index:\n", "        df['title'][i] = filterstr(df['title'][i])\n", "        df['author'][i] = filterstr(df['author'][i])\n", "        df['selftext'][i] = filterstr(df['selftext'][i])\n", "    # Remove accented characters\n", "    df['title'] = df['title'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))\n", "    df['selftext'] = df['selftext'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))\n", "    df = df.drop_duplicates()  # Remove duplicate entries, should they exist\n", "    return df" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "# Names of the popular text-based subreddits we want to look at\n", "subreddits = ['explainlikeimfive', 'AskReddit', 'TalesFromTechsupport',\n", "              'talesFromRetail', 'pettyrevenge', 'askhistorians',\n", "              'askscience', 'tifu', 'nosleep', 'jokes', 'atheism', 'politics']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# Append a date stamp to a txt file to record when the download happened\n", "timeformat = '%m-%d-%y-%H%M'\n", "printdate = datetime.datetime.now().strftime(timeformat)\n", "\n", "with open('Data/New/dltimestamp.txt', 'a') as the_file:\n", "    the_file.write(printdate + '\\n')\n", "\n", "for subreddit in subreddits:\n", "    #print subreddit\n", "\n", "    # Reddit only serves up to 1000 posts per section of a subreddit,\n", "    # so we try to get as many as possible\n", "    top_all = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'all', status=False)\n", "    top_week = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'week', status=False)\n", "    top_day = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'day', status=False)\n", "    hot = get_subreddit_df(subreddit, 'hot', user_agent, 1000, status=False)\n", "    new = get_subreddit_df(subreddit, 'new', user_agent, 1000, status=False)\n", "\n", "    # Subset and clean each dataframe\n", "    top_all = subset_subreddit(top_all)\n", "    top_week = subset_subreddit(top_week)\n", "    top_day = subset_subreddit(top_day)\n", "    hot = subset_subreddit(hot)\n", "    new = subset_subreddit(new)\n", "\n", "    # Add a column to each denoting the name of the subreddit\n", "    top_all['subreddit'] = subreddit\n", "    top_week['subreddit'] = subreddit\n", "    top_day['subreddit'] = subreddit\n", "    hot['subreddit'] = subreddit\n", "    new['subreddit'] = subreddit\n", "\n", "    # Add a column denoting the section of the subreddit\n", "    top_all['type'] = 'top_all'\n", "    top_week['type'] = 'top_week'\n", "    top_day['type'] = 'top_day'\n", "    hot['type'] = 'hot'\n", "    new['type'] = 'new'\n", "\n", "    # Write the scraped subreddit information to csv files\n", "    dltopall = 'Data/New/' + subreddit + 'top_all.csv'\n", "    dltopweek = 'Data/New/' + subreddit + 'top_week.csv'\n", "    dltopday = 'Data/New/' + subreddit + 'top_day.csv'\n", "    dlhot = 'Data/New/' + subreddit + 'hot.csv'\n", "    dlnew = 'Data/New/' + subreddit + 'new.csv'\n", "\n", "    top_all.to_csv(dltopall, 
index=False, encoding='utf-8')\n", "    top_week.to_csv(dltopweek, index=False, encoding='utf-8')\n", "    top_day.to_csv(dltopday, index=False, encoding='utf-8')\n", "    hot.to_csv(dlhot, index=False, encoding='utf-8')\n", "    new.to_csv(dlnew, index=False, encoding='utf-8')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "explainlikeimfive\n", "AskReddit" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "TalesFromTechsupport" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "talesFromRetail" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "pettyrevenge" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "askhistorians" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "askscience" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "tifu" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "nosleep" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "jokes" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "atheism" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "politics" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 16 }
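, { "cell_type": "markdown", "metadata": {}, "source": [ "Quick sanity check: the cell below is a minimal usage sketch, not part of the original download pipeline. The choice of subreddit, the 25-post sample size, and the conversion of time_created (a Unix timestamp in UTC) to a readable datetime are illustrative assumptions; it also assumes Reddit's public JSON listings are reachable with the user agent defined above." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Minimal usage sketch (assumes the cells above have already been run).\n", "# The subreddit and the 25-post sample size are arbitrary choices for a quick test.\n", "sample = get_subreddit_df('explainlikeimfive', 'top', user_agent, 25, 'week', api_call_limit=25, status=True)\n", "sample = subset_subreddit(sample)\n", "\n", "# time_created holds Unix timestamps (UTC); convert them to datetimes for readability\n", "sample['time_created'] = sample['time_created'].apply(lambda ts: datetime.datetime.utcfromtimestamp(float(ts)))\n", "\n", "print sample.shape\n", "print sample[['title', 'score', 'comments', 'time_created']].head()" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }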