{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import json\n",
      "import os\n",
      "import urllib\n",
      "import urllib2\n",
      "import numpy as np\n",
      "import unicodedata\n",
      "from myalchemy import MyAlchemy\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#Since we only get so many Alchemy API calls, might as well not call\n",
      "#Alchemy on duplicate posts between subreddits. We'll merge these later......\n",
      "#df = pd.read_csv('Data/full.csv', encoding='utf-8')\n",
      "\n",
      "#print \"Original size of data set is\", len(df)\n",
      "#df = df.drop_duplicates('id') # We only want unique post id entries, not to waste alchemy calls\n",
      "#print \"Size of data set with only unique posts is\", len(df)\n",
      "#subs = list(df['subreddit'].unique())\n",
      "#dflen = len(df)\n",
      "#df['alchemy'] = ['null']*dflen"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "If you are re-running the program, comment out the cell block above and uncomment the cell block below to use it instead."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#df = pd.read_csv('Data/uniqueentries.csv', encoding='utf-8')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "file_dir = \"Data/combinedcomments/\"\n",
      "\n",
      "# Only the top level of file_dir is listed. The builtin next() is used\n",
      "# instead of the Python-2-only generator method .next().\n",
      "path, dirs, files = next(os.walk(file_dir))\n",
      "# Match on the extension so that names such as 'x.csv.bak' are not picked up.\n",
      "csvfiles = sorted(file_dir + name for name in files if name.endswith('.csv'))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def check_null(x):\n",
      "    try:\n",
      "        np.isnan(x)\n",
      "        return False\n",
      "    except:\n",
      "        return True"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def alchemy_comments(df, start, apikey, csvfiles, end=None):\n",
      "    '''\n",
      "    Run the Alchemy keyword analysis on the comments of the posts in df.\n",
      "\n",
      "    For each row position in [start, end) the comments of that post are\n",
      "    taken from the comment file whose path contains the post's subreddit,\n",
      "    joined with the title (and the self text when present) and sent to\n",
      "    Alchemy. The result is written to the 'alchemy' column of that row.\n",
      "    Rows that fail are reported with print and left unchanged.\n",
      "\n",
      "    df       -- frame with 'subreddit', 'id', 'title' and 'selftext'\n",
      "                columns; modified in place ('alchemy' is added if missing)\n",
      "    start    -- first row position to process\n",
      "    apikey   -- key passed to MyAlchemy\n",
      "    csvfiles -- paths of comment csv files with 'type', 'comment' and\n",
      "                'post' columns\n",
      "    end      -- row position to stop before; None (default) means the\n",
      "                length of the frame passed in, evaluated at call time\n",
      "\n",
      "    Returns the same df object.\n",
      "    '''\n",
      "    max_chars = 8000  # longest text sent in a single call\n",
      "    p = MyAlchemy(apikey)\n",
      "    if end is None:\n",
      "        end = len(df)\n",
      "    if 'alchemy' not in df.columns:\n",
      "        # Without the column every result would be lost after the API call.\n",
      "        df['alchemy'] = 'null'\n",
      "\n",
      "    cleaned = {}  # comment file path -> cleaned comment frame\n",
      "    for label in df.index[start:end]:\n",
      "        subrequest = df.at[label, 'subreddit']\n",
      "        postid = df.at[label, 'id']\n",
      "\n",
      "        # The last file whose path contains the subreddit name wins.\n",
      "        commentfile = ''\n",
      "        for comb in csvfiles:\n",
      "            if subrequest in comb:\n",
      "                commentfile = comb\n",
      "        if not commentfile:\n",
      "            print('No comment file found for %s' % subrequest)\n",
      "            continue\n",
      "\n",
      "        # Each comment file is read and cleaned once instead of once per post.\n",
      "        if commentfile not in cleaned:\n",
      "            commentdf = pd.read_csv(commentfile, encoding='utf-8')\n",
      "            commentdf = commentdf.drop('type', axis=1).drop_duplicates()\n",
      "            # check_null is True for values np.isnan rejects, i.e. real text.\n",
      "            commentdf = commentdf[commentdf['comment'].apply(check_null)].copy()\n",
      "            commentdf['comment'] = commentdf['comment'].apply(\n",
      "                lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))\n",
      "            cleaned[commentfile] = commentdf\n",
      "        commentdf = cleaned[commentfile]\n",
      "        comments = list(commentdf[commentdf['post'] == postid]['comment'])\n",
      "\n",
      "        # The title, and the self text when there is one, are analysed too.\n",
      "        comments.append(df.at[label, 'title'])\n",
      "        selftext = df.at[label, 'selftext']\n",
      "        if check_null(selftext):\n",
      "            comments.append(selftext)\n",
      "\n",
      "        try:\n",
      "            comments = ' '.join(comments)[:max_chars]\n",
      "        except (TypeError, UnicodeError):\n",
      "            # Do not spend one of the limited API calls on unjoined text.\n",
      "            print('Comment join error %s' % (comments,))\n",
      "            continue\n",
      "\n",
      "        # I'm not sure what Alchemy does once you reach the cap... so you might want to check if null or something here.\n",
      "        try:\n",
      "            df.at[label, 'alchemy'] = p.run_method(\n",
      "                comments, 'keywords', {'keywordExtractMode': 'strict'})\n",
      "        except Exception:\n",
      "            # Exception (not a bare except) so Ctrl-C still stops the loop.\n",
      "            print('Alchemy error %s' % postid)\n",
      "\n",
      "    return df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "This section calls the alchemy keyword grabber for each post in the unique data set."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#Alchemy keys\n",
      "apikey1 = \"dcac82649daaa2627ee783b25779cfaed4af0067\" #Jay's key\n",
      "apikey2 = \"e945cef59338f9e8e7bc962badde170e623fb7e5\" #Basti's key\n",
      "apikey3 = \"cb736ca44e57cd6764b70ec86886f4fce8f6a68d\" #Serguei's Key"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Each alchemy key only works for 1000 calls so this part must be done by multiple people over many days."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#df = alchemy_comments(df, 25000, apikey2, csvfiles, end=25992)\n",
      "#df.to_csv('Data/uniqueentries.csv', index=False, encoding='utf-8')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 20
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Now, we need to merge the unique data set into the full data set. "
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Load the complete data set; every row starts with the 'null' placeholder\n",
      "# in its 'alchemy' column.\n",
      "fulldf = pd.read_csv('Data/full.csv', encoding='utf-8')\n",
      "fulldf['alchemy'] = 'null'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# df holds the unique posts with their Alchemy results (it is saved to and\n",
      "# loaded from Data/uniqueentries.csv). Copy each result onto every row of\n",
      "# fulldf that has the same post id.\n",
      "alchemy_by_id = df.drop_duplicates('id').set_index('id')['alchemy']\n",
      "# map() stores the value itself in each cell (not a one-element Series);\n",
      "# ids without a result keep the 'null' placeholder.\n",
      "fulldf['alchemy'] = fulldf['id'].map(alchemy_by_id).fillna('null')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Write the merged frame back over the full data set file.\n",
      "fulldf.to_csv('Data/full.csv', encoding='utf-8', index=False)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#print p.run_method(comments, 'concepts')\n",
      "#print p.run_method(comments, 'keywords', {'keywordExtractMode':'strict'})\n",
      "#print p.run_method(comments, 'category')\n",
      "#print p.run_method(comments, 'sentiment')\n",
      "#print p.run_method(comments, 'entities')\n",
      "#print p.run_method(reddit_base, 'urlkeywords')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}