{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import time\n", "import pandas as pd\n", "import json\n", "import os\n", "import urllib\n", "import urllib2\n", "import datetime" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "In order to use the reddit API we need to specify an user agent with a description and a link to our project" ] }, { "cell_type": "code", "collapsed": false, "input": [ "user_agent = (\"Project for Data Science class v1.0\" \" https://github.com/jaysayre/intelligentdolphins\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "def json_extract(baseurl, headrs=None, params=None, extraparam=None):\n", " '''\n", " Helper function to download and read json data. Takes in explanatory headers and returns json dict.\n", " '''\n", " if params != None:\n", " if extraparam != None:\n", " params['t'] = extraparam\n", " form = urllib.urlencode(params)\n", " url = baseurl+form\n", " else:\n", " url = baseurl\n", " if headrs != None:\n", " request = urllib2.Request(url, headers=headrs)\n", " else: \n", " request = urllib2.Request(url)\n", " return json.loads(urllib2.urlopen(request).read())\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "'''\n", "appends the data we currently look at to the relevant list\n", "'''\n", "def subreddit_json_parser(data, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates):\n", " for post in data['data']['children']:\n", " if post['kind'] == 't3':\n", " for key, value in post['data'].items():\n", " if key == \"title\":\n", " title.append(value)\n", " elif key == \"id\":\n", " ids.append(str(value))\n", " elif key == \"ups\":\n", " upvts.append(value)\n", " elif key == \"downs\":\n", " downvts.append(value)\n", " elif key == \"author\":\n", " authors.append(value)\n", " elif key == \"num_comments\":\n", " comments.append(value)\n", " elif key == \"score\":\n", " score.append(value)\n", " elif key == \"media\":\n", " media.append(value)\n", " elif key == \"distinguished\":\n", " distin.append(value)\n", " elif key == \"selftext\":\n", " selftxt.append(value)\n", " elif key == \"created_utc\":\n", " dates.append(value)\n", " else:\n", " pass" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "def get_subreddit_df(subreddit, sort_call, user_agent, n, t=None, api_call_limit=100, status=False):\n", " '''\n", " Builds a subreddit dataframe.\n", " \n", " Parameters --\n", " subreddit: specifies which reddit subreddit to pull information from\n", " sort_call: specifies whether you want to sort by top, hot, rising etc.\n", " user_agent: so reddit won't reject our https requests\n", " n: int describing how many top results one would like\n", " t: specifies whether scope should be week, hour, day, year, etc.\n", " api_call_limit: Reddit doesn't serve more than 100 results at a time\n", " status: whether or not you want the status of the download printed in the console\n", " \n", " Returns --\n", " \n", " A pandas dataframe containing various bits of information about the top posts.\n", " \n", " '''\n", " reddit_base = 'http://www.reddit.com/r/%s/%s.json?' 
% (subreddit, sort_call)  # Base API call\n", "    headers = {'User-agent': user_agent}\n", "\n", "    # Empty lists for the information we'll extract\n", "    ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates = [], [], [], [], [], [], [], [], [], [], []\n", "\n", "    # Make sure n and api_call_limit aren't floats!\n", "    n = int(n)\n", "    api_call_limit = int(api_call_limit)\n", "\n", "    # Since Reddit serves at most api_call_limit posts per request, split the n requested posts into several requests\n", "    if n % api_call_limit != 0:\n", "        remainder = n % api_call_limit\n", "        num = (n / api_call_limit) + 1\n", "    else:\n", "        num = n / api_call_limit\n", "        remainder = api_call_limit\n", "\n", "    # Makes an API call for all n entries based on the API call limit\n", "    for i in range(num):\n", "        if i == 0:\n", "            # If this is also the last request (num == 1), only ask for the remainder\n", "            post_params = {'limit': api_call_limit if num > 1 else remainder}\n", "            jsondata = json_extract(reddit_base, headers, post_params, t)\n", "            tostartfrom = jsondata['data']['after']\n", "            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates)\n", "            time.sleep(2)\n", "            if status:\n", "                print \"Downloaded %s posts...\" % len(set(ids))\n", "        elif i == num - 1:\n", "            post_params = {'limit': remainder, 'after': tostartfrom}  # 'after' is the post to continue from\n", "            jsondata = json_extract(reddit_base, headers, post_params, t)\n", "            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates)\n", "            time.sleep(2)\n", "            if status:\n", "                print \"Downloaded %s posts...\" % len(set(ids))\n", "        else:\n", "            post_params = {'limit': api_call_limit, 'after': tostartfrom}\n", "            jsondata = json_extract(reddit_base, headers, post_params, t)\n", "            tostartfrom = jsondata['data']['after']\n", "            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates)\n", "            time.sleep(2)\n", "            if status:\n", "                print \"Downloaded %s posts...\" % len(set(ids))\n", "\n", "    tempdict = {'id': ids, 'title': title, 'upvotes': upvts, 'media': media, 'distinguished': distin,\n", "                'selftext': selftxt, 'downvotes': downvts, 'comments': comments, 'score': score, 'author': authors, 'time_created': dates}\n", "\n", "    return pd.DataFrame(tempdict)\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "# Helper function to determine whether x can be interpreted as a number\n", "def isfloat(x):\n", "    try:\n", "        float(x)\n", "        return True\n", "    except (TypeError, ValueError):\n", "        return False\n", "\n", "# Helper function to clean strings: keep only letters and spaces\n", "def filterstr(x):\n", "    try:\n", "        return ''.join(e for e in x if (e == ' ' or e.isalnum()) and not e.isdigit())\n", "    except TypeError:\n", "        return ''" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "# Function to clean a downloaded subreddit dataframe\n", "def subset_subreddit(df):\n", "    df = df[df['distinguished'].apply(lambda x: x == None)]  # Make sure the poster isn't a moderator (distinguished)\n", "    df = df[df['media'].apply(lambda x: x == None)]  # Also make sure the post is text based\n", "    df = df[df['title'].apply(lambda x: x != None)]  # Make sure the title isn't empty\n", "    df = df.drop('distinguished', 1)\n", "    df = df.drop('media', 1)\n", "    # Occasionally, text strings will be in columns where only ints should be. 
This takes care of that.\n", "    df = df[df['comments'].apply(lambda x: isfloat(x))]\n", "    df = df[df['downvotes'].apply(lambda x: isfloat(x))]\n", "    df = df[df['score'].apply(lambda x: isfloat(x))]\n", "    df = df[df['upvotes'].apply(lambda x: isfloat(x))]\n", "    # Ids can be weird as well, sometimes they manifest as long numbers.\n", "    # Reddit uses an id system in base 36, so a reasonable id shouldn't be longer than 8 characters\n", "    df = df[df['id'].apply(lambda x: len(str(x)) <= 8)]\n", "    # Filter out characters that cause problems later\n", "    for i in df.index:\n", "        df['title'][i] = filterstr(df['title'][i])\n", "        df['author'][i] = filterstr(df['author'][i])\n", "        df['selftext'][i] = filterstr(df['selftext'][i])\n", "    # Remove accented characters\n", "    df['title'] = df['title'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))\n", "    df['selftext'] = df['selftext'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))\n", "    df = df.drop_duplicates()  # Remove duplicate entries, should they exist\n", "    return df" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "# Names of the popular text-based subreddits we want to look at\n", "subreddits = ['explainlikeimfive', 'AskReddit', 'TalesFromTechsupport',\n", "              'talesFromRetail', 'pettyrevenge', 'askhistorians',\n", "              'askscience', 'tifu', 'nosleep', 'jokes', 'atheism', 'politics']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# Append a date stamp to a txt file to record when the download happened\n", "timeformat = '%m-%d-%y-%H%M'\n", "printdate = datetime.datetime.now().strftime(timeformat)\n", "\n", "with open('Data/New/dltimestamp.txt', 'a') as the_file:\n", "    the_file.write(printdate + '\\n')\n", "\n", "for subreddit in subreddits:\n", "    #print subreddit\n", "\n", "    # Reddit only serves up to 1000 posts per section of a subreddit,\n", "    # so we try to get as many as possible\n", "    top_all = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'all', status=False)\n", "    top_week = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'week', status=False)\n", "    top_day = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'day', status=False)\n", "    hot = get_subreddit_df(subreddit, 'hot', user_agent, 1000, status=False)\n", "    new = get_subreddit_df(subreddit, 'new', user_agent, 1000, status=False)\n", "\n", "    # Subset and clean each dataframe\n", "    top_all = subset_subreddit(top_all)\n", "    top_week = subset_subreddit(top_week)\n", "    top_day = subset_subreddit(top_day)\n", "    hot = subset_subreddit(hot)\n", "    new = subset_subreddit(new)\n", "\n", "    # Add a column to each denoting the name of the subreddit\n", "    top_all['subreddit'] = subreddit\n", "    top_week['subreddit'] = subreddit\n", "    top_day['subreddit'] = subreddit\n", "    hot['subreddit'] = subreddit\n", "    new['subreddit'] = subreddit\n", "\n", "    # Add a column denoting the section of the subreddit\n", "    top_all['type'] = 'top_all'\n", "    top_week['type'] = 'top_week'\n", "    top_day['type'] = 'top_day'\n", "    hot['type'] = 'hot'\n", "    new['type'] = 'new'\n", "\n", "    # Write the scraped subreddit information to csv files\n", "    dltopall = 'Data/New/' + subreddit + 'top_all.csv'\n", "    dltopweek = 'Data/New/' + subreddit + 'top_week.csv'\n", "    dltopday = 'Data/New/' + subreddit + 'top_day.csv'\n", "    dlhot = 'Data/New/' + subreddit + 'hot.csv'\n", "    dlnew = 'Data/New/' + subreddit + 'new.csv'\n", "\n", "    top_all.to_csv(dltopall, 
index=False, encoding='utf-8')\n", "    top_week.to_csv(dltopweek, index=False, encoding='utf-8')\n", "    top_day.to_csv(dltopday, index=False, encoding='utf-8')\n", "    hot.to_csv(dlhot, index=False, encoding='utf-8')\n", "    new.to_csv(dlnew, index=False, encoding='utf-8')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "explainlikeimfive\n", "AskReddit" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "TalesFromTechsupport" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "talesFromRetail" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "pettyrevenge" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "askhistorians" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "askscience" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "tifu" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "nosleep" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "jokes" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "atheism" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "politics" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 16 }
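, { "cell_type": "markdown", "metadata": {}, "source": [ "Quick sanity check: the cell below is a minimal usage sketch, not part of the original download pipeline. The choice of subreddit, the 25-post sample size, and the conversion of time_created (a Unix timestamp in UTC) to a readable datetime are illustrative assumptions; it also assumes Reddit's public JSON listings are reachable with the user agent defined above." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Minimal usage sketch (assumes the cells above have already been run).\n", "# The subreddit and the 25-post sample size are arbitrary choices for a quick test.\n", "sample = get_subreddit_df('explainlikeimfive', 'top', user_agent, 25, 'week', api_call_limit=25, status=True)\n", "sample = subset_subreddit(sample)\n", "\n", "# time_created holds Unix timestamps (UTC); convert them to datetimes for readability\n", "sample['time_created'] = sample['time_created'].apply(lambda ts: datetime.datetime.utcfromtimestamp(float(ts)))\n", "\n", "print sample.shape\n", "print sample[['title', 'score', 'comments', 'time_created']].head()" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }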