{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "%matplotlib inline\n", "\n", "import time\n", "import pandas as pd\n", "import numpy as np\n", "import json\n", "import os\n", "import urllib\n", "import urllib2\n", "import matplotlib.pyplot as plt\n", "\n", "import nltk\n", "from nltk.util import ngrams\n", "from nltk.collocations import *\n", "from nltk.corpus import stopwords\n", "\n", "import pandas as pd\n", "import math\n", "from scipy.stats import pearsonr\n", "\n", "user_agent = (\"Project for Data Science class v1.0\" \" /u/Valedra\" \" https://github.com/jaysayre/intelligentdolphins\")\n", "\n", "\n", "def json_extract(baseurl, headrs=None, params=None):\n", " '''\n", " Helper function to download and read json data. Takes in explanatory headers and returns json dict.\n", " '''\n", " if params != None:\n", " form = urllib.urlencode(params)\n", " url = baseurl+form\n", " else:\n", " url = baseurl\n", " \n", " if headrs != None:\n", " request = urllib2.Request(url, headers=headrs)\n", " else: \n", " request = urllib2.Request(url)\n", " return json.loads(urllib2.urlopen(request).read())\n", "\n", "def return_grams(sentence, n=[1, 3], minlength=3):\n", " gramslist = []\n", " mysentencetokens_sw= nltk.word_tokenize(sentence)\n", " mysentencetokens = [token for token in mysentencetokens_sw if (not token in stopwords.words('english')) and len(token) >= minlength]\n", " for j in range(n[0], n[1]+1):\n", " somegrams = ngrams(mysentencetokens, j)\n", " for grams in somegrams:\n", " gramslist.append(' '.join(grams))\n", " return gramslist \n", "\n", "def search_keyword(title, maxscores, user_agent, n=[3,3], postid='bhjfb', sort_call='relevance', t='all', subreddit=None, api_call_limit=100):\n", " scores = {}\n", " header = {'User-agent': user_agent}\n", " for term in return_grams(title, [n[0],n[1]]): \n", " post_params = {'q':term, 'sort': sort_call, 't':t, 
'limit':api_call_limit}\n", " if subreddit == None:\n", " reddit_base = 'http://www.reddit.com/r/search/search.json?' # If we want to search all of reddit\n", " else:\n", " reddit_base = 'http://www.reddit.com/r/%s/search.json?' % subreddit\n", " post_params.update ({'restrict_sr':'on'})\n", " \n", " #Makes sure maxscores isn't a float!\n", " maxscores = int(maxscores) \n", " api_call_limit = int(api_call_limit)\n", " \n", " #Since reddit only provides <= 100 calls at a time, looks at n requested and splits it up into different requests\n", " if maxscores%api_call_limit != 0:\n", " remainder = maxscores%api_call_limit\n", " num = (maxscores/api_call_limit) +1\n", " else:\n", " num = maxscores/api_call_limit\n", " remainder = api_call_limit\n", " \n", " #Makes an api call for all n entries based on the api call limit\n", " for i in range(num):\n", " if i == 0:\n", " jsondata = json_extract(reddit_base, header, post_params)\n", " tostartfrom = jsondata['data']['after']\n", " for item in jsondata['data']['children']:\n", " if item['data']['score'] != 0:\n", " scores.update({item['data']['id'] : item['data']['score']})\n", " elif i == num - 1:\n", " post_params.update({'limit': remainder, 'after': tostartfrom}) #Indicates the post after we wish to call from\n", " jsondata = json_extract(reddit_base, header, post_params)\n", " for item in jsondata['data']['children']:\n", " if item['data']['score'] != 0:\n", " scores.update({item['data']['id'] : item['data']['score']})\n", " else: \n", " post_params.update({'after': tostartfrom}) \n", " jsondata = json_extract(reddit_base, header, post_params)\n", " tostartfrom = jsondata['data']['after']\n", " for item in jsondata['data']['children']:\n", " if item['data']['score'] != 0:\n", " scores.update({item['data']['id'] : item['data']['score']})\n", " try:\n", " scores.pop(postid)\n", " except:\n", " pass\n", " \n", " return scores.values()\n", "\n", "\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 
27 }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this file we tried to generate trigrams in titles. \n", "\n", "Based on those we searched reddit and stored the scores of the search results. \n", "\n", "If those were zero, we tried the same thing using bigrams. \n", "\n", "We were hoping that the scores in search results would tell us something about the score of the post we were looking at.\n", "\n", "\n", "\n", "