{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Debate transcript scraper\n",
    "\n",
    "Downloads New York Times presidential debate transcripts, groups the spoken text by candidate, and writes each candidate's words to `candidate_words_dict.json`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import json\n",
    "from collections import defaultdict\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Speaker tags to keep and the transcript pages to download."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# The last name contains a curly apostrophe (U+2019). It must be a unicode literal:\n",
    "# on Python 2 a non-ASCII byte string never compares equal to the unicode text\n",
    "# returned by BeautifulSoup (the comparison only emits a UnicodeWarning), so that\n",
    "# speaker would silently be dropped from the results.\n",
    "candidates = ['BUSH', 'TRUMP', 'RUBIO', 'CARSON', 'FIORINA', 'KASICH', 'CRUZ', 'PAUL',\n",
    "              'SANDERS', 'CLINTON', u'O\\u2019MALLEY']\n",
    "urls = ['http://www.nytimes.com/2015/11/11/us/politics/transcript-republican-presidential-debate.html',\n",
    "        'http://www.nytimes.com/2016/02/14/us/politics/transcript-of-the-republican-presidential-debate.html',\n",
    "        'http://www.nytimes.com/2016/01/15/us/politics/transcript-of-republican-presidential-debate.html',\n",
    "        'http://www.nytimes.com/2016/01/29/us/politics/republican-presidential-debate-transcript.html?_r=0',\n",
    "        'http://www.nytimes.com/2016/01/18/us/politics/transcript-of-the-democratic-presidential-debate.html',\n",
    "        'http://www.nytimes.com/2016/02/12/us/politics/transcript-of-the-democratic-presidential-debate-in-milwaukee.html',\n",
    "        'http://www.nytimes.com/2016/02/07/us/politics/transcript-of-the-republican-presidential-debate-in-new-hampshire.html',\n",
    "        'http://www.nytimes.com/2016/02/05/us/politics/transcript-of-the-democratic-presidential-debate.html',\n",
    "        'http://www.nytimes.com/2016/01/29/us/politics/republican-presidential-debate-preliminary-transcript.html',\n",
    "        'http://www.nytimes.com/2015/12/16/us/politics/transcript-main-republican-presidential-debate.html']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scraping helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def text_to_dict(paragraph_array, d, candidates):\n",
    "    '''Group transcript paragraphs by speaker.\n",
    "\n",
    "    paragraph_array: iterable of objects with a `.text` attribute (for example\n",
    "        the <p> tags returned by BeautifulSoup), in transcript order.\n",
    "    d: mapping of speaker -> list of paragraphs. It is mutated in place and must\n",
    "        create missing keys on access (for example `defaultdict(list)`).\n",
    "    candidates: collection of speaker names whose text is kept.\n",
    "\n",
    "    Returns d. Each stored paragraph is the list of words spoken; a leading\n",
    "    `SPEAKER:` tag is not part of the stored words.\n",
    "    '''\n",
    "    # Placeholder speaker that is never in candidates; it is replaced as soon\n",
    "    # as the first real speaker tag is found.\n",
    "    speaker = ''\n",
    "    for paragraph in paragraph_array:\n",
    "        # split() without an argument ignores leading and repeated whitespace\n",
    "        # and returns [] for a blank paragraph, so words[0] is always a real token.\n",
    "        words = paragraph.text.split()\n",
    "        if not words:\n",
    "            continue\n",
    "        # A new speaker starts only when the paragraph opens with `SPEAKER:`.\n",
    "        if words[0].endswith(':'):\n",
    "            speaker = words[0][:-1]\n",
    "            spoken = words[1:]\n",
    "        else:\n",
    "            # Continuation of the current speaker: every word is speech, so\n",
    "            # nothing is stripped from the front.\n",
    "            spoken = words\n",
    "        # Only keep text spoken by the requested candidates.\n",
    "        if speaker in candidates:\n",
    "            d[speaker].append(spoken)\n",
    "    return d\n",
    "\n",
    "\n",
    "def process_url(url, speaker_dict, candidates, timeout=30):\n",
    "    '''Download one transcript page and add its candidate text to speaker_dict.\n",
    "\n",
    "    url: address of the transcript page.\n",
    "    speaker_dict: mapping updated in place by text_to_dict.\n",
    "    candidates: collection of speaker names whose text is kept.\n",
    "    timeout: seconds passed to requests.get so a stalled server cannot hang\n",
    "        the notebook forever.\n",
    "\n",
    "    Raises requests.exceptions.HTTPError when the server answers with a\n",
    "    4xx/5xx status, instead of silently parsing the error page into no data.\n",
    "    '''\n",
    "    response = requests.get(url, timeout=timeout)\n",
    "    response.raise_for_status()\n",
    "    # BeautifulSoup turns the raw HTML into a structure that is easy to search.\n",
    "    soup = BeautifulSoup(response.text, 'html5lib')\n",
    "    # Transcript text lives in <p class='story-body-text'> tags (found by\n",
    "    # inspecting the page source); the first and last such paragraphs are the\n",
    "    # intro and the ending, so they are dropped.\n",
    "    paragraphs = soup('p', {'class': 'story-body-text'})[1:-1]\n",
    "    text_to_dict(paragraphs, speaker_dict, candidates)\n",
    "\n",
    "\n",
    "def process_url_list(urls, speaker_dict, candidates):\n",
    "    '''Run process_url on every url, accumulating results in speaker_dict.'''\n",
    "    for url in urls:\n",
    "        process_url(url, speaker_dict, candidates)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download and parse the transcripts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Created in the same cell as the scrape so that re-running the cell starts\n",
    "# from an empty dict instead of appending duplicate paragraphs.\n",
    "speaker_dict = defaultdict(list)\n",
    "\n",
    "process_url_list(urls, speaker_dict, candidates)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Flatten paragraphs into word lists"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def pargraphs_to_words(speaker_dict):\n",
    "    '''Flatten each speaker's paragraphs into a single list of words.\n",
    "\n",
    "    speaker_dict: mapping of speaker -> list of paragraphs, where each\n",
    "        paragraph is a list of words.\n",
    "\n",
    "    Returns a new defaultdict(list) mapping speaker -> list of words, in\n",
    "    paragraph order. The input mapping is not modified.\n",
    "    '''\n",
    "    d = defaultdict(list)\n",
    "    for candidate, paragraphs in speaker_dict.items():\n",
    "        for paragraph in paragraphs:\n",
    "            d[candidate].extend(paragraph)\n",
    "    return d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "candidate_words = pargraphs_to_words(speaker_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with open('candidate_words_dict.json', 'w') as f:\n",
    "    json.dump(candidate_words, f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}