{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tylerfolkman/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:19: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from collections import defaultdict\n",
    "\n",
    "\n",
    "def text_to_dict(paragraph_array, d, candidates):\n",
    "    '''takes an array of text paragraphs from debate and returns dict \n",
    "    where key is person and value is list of text spoken by that candidate'''\n",
    "    # just a default speaker that won't end up in our returned data\n",
    "    # will get replaced when an actual speaker is found\n",
    "    speaker = \"<START>\"\n",
    "    for paragraph in paragraph_array:\n",
    "        words = paragraph.text.split(' ')\n",
    "        first_word = words[0]\n",
    "        # only new speaker when have SPEAKER: format\n",
    "        if first_word[-1] == \":\":\n",
    "            speaker = first_word[:-1]\n",
    "        # only keep candidates text\n",
    "        if speaker in candidates:\n",
    "            d[speaker].append(words[1:])\n",
    "    return d\n",
    "\n",
    "\n",
    "def process_url(url, speaker_dict, candidates):\n",
    "    # requests gets the source code from the url and extracts it as text\n",
    "    html = requests.get(url).text\n",
    "    # beautifulsoup is a library that takes in text source code and returns a structured format of that\n",
    "    # source code that you can more easily search and parse.\n",
    "    soup = BeautifulSoup(html, 'html5lib')\n",
    "    # get all the 'p' tags from the source with class = 'story-body-text'\n",
    "    # this was determined by looking at the source code\n",
    "    # the first and last paragraphs are intro and ending\n",
    "    paragraphs = soup('p', {'class': 'story-body-text'})[1:-1]\n",
    "    text_to_dict(paragraphs, speaker_dict, candidates)\n",
    "    \n",
    "    \n",
    "def process_url_list(urls, speaker_dict, candidates):\n",
    "    for url in urls:\n",
    "        process_url(url, speaker_dict, candidates)\n",
    "    \n",
    "    \n",
    "candidates = ['BUSH', 'TRUMP', 'RUBIO', 'CARSON', 'FIORINA', 'KASICH', 'CRUZ', 'PAUL',\n",
    "             'SANDERS', 'CLINTON', \"O’MALLEY\"]\n",
    "urls = ['http://www.nytimes.com/2015/11/11/us/politics/transcript-republican-presidential-debate.html',\n",
    "       'http://www.nytimes.com/2016/02/14/us/politics/transcript-of-the-republican-presidential-debate.html',\n",
    "       'http://www.nytimes.com/2016/01/15/us/politics/transcript-of-republican-presidential-debate.html',\n",
    "       'http://www.nytimes.com/2016/01/29/us/politics/republican-presidential-debate-transcript.html?_r=0',\n",
    "       'http://www.nytimes.com/2016/01/18/us/politics/transcript-of-the-democratic-presidential-debate.html',\n",
    "       'http://www.nytimes.com/2016/02/12/us/politics/transcript-of-the-democratic-presidential-debate-in-milwaukee.html',\n",
    "       'http://www.nytimes.com/2016/02/07/us/politics/transcript-of-the-republican-presidential-debate-in-new-hampshire.html',\n",
    "       'http://www.nytimes.com/2016/02/05/us/politics/transcript-of-the-democratic-presidential-debate.html',\n",
    "       'http://www.nytimes.com/2016/01/29/us/politics/republican-presidential-debate-preliminary-transcript.html',\n",
    "       'http://www.nytimes.com/2015/12/16/us/politics/transcript-main-republican-presidential-debate.html']\n",
    "speaker_dict = defaultdict(list)\n",
    "\n",
    "process_url_list(urls, speaker_dict, candidates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def pargraphs_to_words(speaker_dict):\n",
    "    d = defaultdict(list)\n",
    "    for candidate, paragraphs in speaker_dict.items():\n",
    "        for paragraph in paragraphs:\n",
    "            for word in paragraph:\n",
    "                d[candidate].append(word)\n",
    "    return d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "candidate_words = pargraphs_to_words(speaker_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('candidate_words_dict.json', 'w') as f:\n",
    "    json.dump(candidate_words, f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}