{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import json\n", "import os\n", "import urllib\n", "import urllib2\n", "import numpy as np\n", "import unicodedata\n", "from myalchemy import MyAlchemy\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "#Since we only get so many Alchemy API calls, might as well not call\n", "#Alchemy on duplicate posts between subreddits. We'll merge these later......\n", "#df = pd.read_csv('Data/full.csv', encoding='utf-8')\n", "\n", "#print \"Original size of data set is\", len(df)\n", "#df = df.drop_duplicates('id') # We only want unique post id entries, not to waste alchemy calls\n", "#print \"Size of data set with only unique posts is\", len(df)\n", "#subs = list(df['subreddit'].unique())\n", "#dflen = len(df)\n", "#df['alchemy'] = ['null']*dflen" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you are re-running the program, keep the cell block above commented out and uncomment the cell block below instead, so that the results already saved in `Data/uniqueentries.csv` are reloaded rather than rebuilt from scratch. 
" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#df = pd.read_csv('Data/uniqueentries.csv', encoding='utf-8')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "file_dir = \"Data/combinedcomments/\"\n", "\n", "path, dirs, files = os.walk(file_dir).next()\n", "csvfiles = [file_dir + i for i in files if \".csv\" in i ] #Builds a list with .csv files\n", "csvfiles.sort()\n", "#csvfiles" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "def check_null(x):\n", " try:\n", " np.isnan(x)\n", " return False\n", " except:\n", " return True" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "def alchemy_comments(df, start, apikey, csvfiles, end=len(df)):\n", " '''\n", " will run the alchemy keyword annalysis on the comment files\n", " '''\n", " p = MyAlchemy(apikey)\n", " dfids = list(df.index)\n", " for i in range(start, end):\n", " subrequest = df['subreddit'][dfids[i]] \n", " commentfile = ''\n", " for comb in csvfiles:\n", " if subrequest in comb:\n", " commentfile = comb\n", " commentdf = pd.read_csv(commentfile, encoding='utf-8')\n", " commentdf = commentdf.drop('type',1)\n", " commentdf = commentdf.drop_duplicates()\n", " commentdf = commentdf[commentdf['comment'].apply(lambda x: check_null(x))]\n", " commentdf['comment'] = commentdf['comment'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))\n", " commentdf = commentdf[commentdf['post'] == df['id'][dfids[i]]]\n", " comments = list(commentdf['comment'])\n", " # If we want to add the title to the alchemy call\n", " comments.append(df['title'][dfids[i]])\n", " # If we want to add the self text to the alchemy call \n", " if check_null(df['selftext'][dfids[i]]):\n", " comments.append(df['selftext'][dfids[i]])\n", " \n", " # Both joining 
the comments and sending alchemy calls can be problematic \n", " try:\n", " comments = ' '.join(comments)\n", " if len(comments) > 8000:\n", " comments = comments[0:7999]\n", " except:\n", " print \"Comment join error\", comments\n", " \n", " # I'm not sure what Alchemy does once you reach the cap... so you might want to check if null or something here.\n", " try:\n", " df['alchemy'][dfids[i]] = p.run_method(comments, 'keywords', {'keywordExtractMode':'strict'})\n", " except:\n", " print \"Alchemy error\", df['id'][dfids[i]]\n", " \n", " return df\n", " " ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ "This section calls the alchemy keyword grabber for each post in the unique data set." ] }, { "cell_type": "code", "collapsed": false, "input": [ "#Alchemy keys\n", "apikey1 = \"dcac82649daaa2627ee783b25779cfaed4af0067\" #Jay's key\n", "apikey2 = \"e945cef59338f9e8e7bc962badde170e623fb7e5\" #Basti's key\n", "apikey3 = \"cb736ca44e57cd6764b70ec86886f4fce8f6a68d\" #Serguei's Key" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Each alchemy key only works for 1000 calls so this part must be done by multiple people over many days." ] }, { "cell_type": "code", "collapsed": false, "input": [ "#df = alchemy_comments(df, 25000, apikey2, csvfiles, end=25992)\n", "#df.to_csv('Data/uniqueentries.csv', index=False, encoding='utf-8')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 20 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we need to merge the unique data set into the full data set. 
" ] }, { "cell_type": "code", "collapsed": false, "input": [ "fulldf = pd.read_csv('Data/full.csv', encoding='utf-8')\n", "fulldf['alchemy'] = ['null']*len(fulldf)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "for i in fulldf.index:\n", " fid = fulldf['id'][i]\n", " alc = df[df['id'] == fid]['alchemy']\n", " fulldf['alchemy'][i] = alc\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "fulldf.to_csv('Data/full.csv', index=False, encoding='utf-8')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "#print p.run_method(comments, 'concepts')\n", "#print p.run_method(comments, 'keywords', {'keywordExtractMode':'strict'})\n", "#print p.run_method(comments, 'category')\n", "#print p.run_method(comments, 'sentiment')\n", "#print p.run_method(comments, 'entities')\n", "#print p.run_method(reddit_base, 'urlkeywords')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }