{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "nbpresent": {
     "id": "8e9acba9-0fd2-4095-a173-551ffa6031d8"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>.container { width:98% !important; }</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import scattertext as st\n",
    "import re\n",
    "from pprint import pprint\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import spacy.en\n",
    "from IPython.display import IFrame\n",
    "from IPython.core.display import display, HTML\n",
    "display(HTML(\"<style>.container { width:98% !important; }</style>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true,
    "nbpresent": {
     "id": "0ab1b935-081b-4ec6-9bff-a49d71bc34f0"
    }
   },
   "outputs": [],
   "source": [
    "nlp = spacy.en.English()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "c887d39c-0d8c-4796-9479-9bfa17122ef3"
    }
   },
   "source": [
    "# Parse debates and create plotting interface\n",
    "The function returns a Pandas data frame consisting of two columns, speaker and statement.  Speaker is the name of the speaker, given in all caps, and statement is the speech made during a particular turn.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "nbpresent": {
     "id": "28c99899-c109-4cfc-b023-45fdb2070e38"
    }
   },
   "outputs": [],
   "source": [
    "def debate_transcript_to_dataframe(fn, speakers):\n",
    "    lines = open(fn).read().split('\\n')\n",
    "    cur_speaker = None\n",
    "    speaker_start_re = re.compile(r'^([(]?[A-Z][A-Z][A-Z]+):?(.+)$')\n",
    "    transcript = []\n",
    "    cur_statement = ''\n",
    "    cur_speaker = None\n",
    "    for line in lines:\n",
    "        match = speaker_start_re.match(line)\n",
    "        if match:\n",
    "            if match.group(1).startswith('('):\n",
    "                continue\n",
    "            if cur_speaker is not None:\n",
    "                transcript.append({'speaker': cur_speaker, 'statement': cur_statement})\n",
    "            cur_speaker = match.group(1).strip()\n",
    "            cur_statement = match.group(2).strip() + '\\n'\n",
    "            for other_name in speakers:\n",
    "                if other_name+':' in cur_statement:\n",
    "                    cur_statement, other_statement = cur_statement.split(other_name)\n",
    "                    transcript.append({'speaker': cur_speaker, 'statement': cur_statement.strip()})\n",
    "                    transcript.append({'speaker': other_name, 'statement': other_statement.strip()})   \n",
    "        else:\n",
    "            cur_statement += line \n",
    "    df = pd.DataFrame(transcript)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read debates into Pandas data frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "nbpresent": {
     "id": "fe53b41b-fd68-475a-9fc1-5f8464bbe938"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>speaker</th>\n",
       "      <th>statement</th>\n",
       "      <th>debate</th>\n",
       "      <th>party</th>\n",
       "      <th>speaker and debate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>QUIJANO</td>\n",
       "      <td>Good evening. From Longwood University in Farm...</td>\n",
       "      <td>VP</td>\n",
       "      <td>Moderator</td>\n",
       "      <td>QUIJANO VP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>QUIJANO</td>\n",
       "      <td>I'm Elaine Quijano, anchor at CBSN, and corres...</td>\n",
       "      <td>VP</td>\n",
       "      <td>Moderator</td>\n",
       "      <td>QUIJANO VP</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   speaker                                          statement debate  \\\n",
       "0  QUIJANO  Good evening. From Longwood University in Farm...     VP   \n",
       "1  QUIJANO  I'm Elaine Quijano, anchor at CBSN, and corres...     VP   \n",
       "\n",
       "       party speaker and debate  \n",
       "0  Moderator         QUIJANO VP  \n",
       "1  Moderator         QUIJANO VP  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parties = {'QUIJANO':'Moderator', \n",
    "           'KAINE':'Democratic', \n",
    "           'PENCE':'Republican', \n",
    "           'HOLT':'Moderator', \n",
    "           'CLINTON':'Democratic', \n",
    "           'TRUMP':'Republican',\n",
    "           'COOPER':'Moderator',\n",
    "           'RADDATZ':'Moderator',\n",
    "           'WALLACE':'Moderator'}\n",
    "\n",
    "debate_dfs = {}\n",
    "for info in [\n",
    "    {'debate': '1st', 'fn': 'presidential-debate-2016-09-26.txt', 'participants': ['TRUMP','CLINTON','HOLT']},\n",
    "    {'debate': 'VP', 'fn': 'vp-debate-2016-10-04.txt', 'participants': ['PENCE','KAINE','QUIJANO']},\n",
    "    {'debate': '2nd', 'fn': 'debate-2016-10-09-rush.txt', 'participants': ['TRUMP','CLINTON','COOPER','RADDATZ']},\n",
    "    {'debate': '3rd', 'fn': 'debate-2016-10-19.txt', 'participants': ['TRUMP','CLINTON','WALLACE']}]:\n",
    "    cur_df = debate_transcript_to_dataframe(info['fn'], info['participants'])\n",
    "    cur_df['debate'] = info['debate']\n",
    "    cur_df['party'] = cur_df['speaker'].apply(lambda x: parties[x])\n",
    "    cur_df['speaker and debate']=cur_df['speaker'].apply(lambda x: x + ' ' + info['debate'])\n",
    "    debate_dfs[info['debate']] = cur_df   \n",
    "df_all = pd.concat(debate_dfs.values())\n",
    "df_all.iloc[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all.to_csv('presidential_debates_2016.csv.gz', compression='gzip', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!cp presidential_debates_2016.csv.gz ../scattertext/scattertext/data/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Function to draw scatter plot in notebook. \n",
    "Creates a chart from text in a data frame, `df`.  The `category` and `other_category` parameters are the names of the columns we'll compare.  The `category_col` is the column in `df` that contains document categories, and contains `category` and `other_category`.  For example, if `category` is \"TRUMP\", then `category_col` would be \"speaker\". `extra` is append to the file name of the html file produced. \n",
    "\n",
    "We'll look at the rest of the optional parameters later.\n",
    "\n",
    "The function returns an iFrame containing containing the HTML visualization, and as a side-effect writes the visualization to an html file, named `category.lower() + '-' + other_category.lower() + extra + '.html'`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "nbpresent": {
     "id": "0080cb63-dc1a-4f70-aeca-2d8e626b6d95"
    }
   },
   "outputs": [],
   "source": [
    "def draw_corpus(df, corpus, category, other_category, category_col, extra='', scores=None, singleScoreMode=False, \n",
    "                minimum_term_frequency=2, grey_zero_scores=False, sort_by_dist=True):\n",
    "    html = st.produce_scattertext_explorer(corpus, \n",
    "                                           category=category, \n",
    "                                           category_name=category.lower(), \n",
    "                                           not_category_name=other_category.lower(),\n",
    "                                           pmi_filter_thresold=2,\n",
    "                                           minimum_term_frequency=minimum_term_frequency,\n",
    "                                           metadata=df['speaker and debate'],\n",
    "                                           scores=scores,\n",
    "                                           width_in_pixels=1000,\n",
    "                                           grey_zero_scores=grey_zero_scores,\n",
    "                                           singleScoreMode=singleScoreMode,\n",
    "                                           sort_by_dist=sort_by_dist)\n",
    "    file_name = category.lower() + '-' + other_category.lower() + extra + '.html'\n",
    "    open(file_name, 'wb').write(html.encode('utf-8'))\n",
    "    return IFrame(src=file_name, width = 1200, height=1000)\n",
    "\n",
    "def draw_plot(df, category, other_category, category_col, extra=''):\n",
    "    # Scattertext can only do a one column vs. all analysis.  We're excluding any other speakrs\n",
    "    category_vs_other_df = df[(df[category_col] == category) | (df[category_col] == other_category)]\n",
    "    corpus = st.CorpusFromPandas(category_vs_other_df, \n",
    "                                 category_col = category_col, \n",
    "                                 text_col = 'statement',\n",
    "                                 nlp = nlp).build()\n",
    "    return draw_corpus(category_vs_other_df,  corpus, category, other_category, category_col, extra=extra)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "d7ed314d-13ff-4031-8e6b-e10f243571f7"
    }
   },
   "source": [
    "# Find the top words used by the candidates in the 3rd debate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "nbpresent": {
     "id": "4158f339-c980-4471-be45-f847fcfcf523"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trump top terms\n",
      "Index(['hillary', 'bad', 'she wants', 'you have', 'the border', 'and she',\n",
      "       'justices', 'signed', 'percent', 'strong', 'outsmarted', 'a disaster',\n",
      "       'she 's', 'deals', 'no idea', 'have no', 'start', 'appoint', 'pouring',\n",
      "       'the baby'],\n",
      "      dtype='object', name='term')\n",
      "Clinton top terms\n",
      "Index(['women', 'kind of', 'against', 'that is', 'work', 'stand',\n",
      "       'undocumented', 'also', 'most', 'guns', 'stand up', 'the debt',\n",
      "       'the kind', 'rights', 'against it', 'v.', 'million', 'families',\n",
      "       'new jobs', 'should be'],\n",
      "      dtype='object', name='term')\n"
     ]
    }
   ],
   "source": [
    "\n",
    "category, other_category, category_col = 'CLINTON', 'TRUMP', 'speaker'\n",
    "debate_3 = st.CorpusFromPandas(data_frame = debate_dfs['3rd'][( debate_dfs['3rd'][category_col] == category) \n",
    "                                                              | ( debate_dfs['3rd'][category_col] == other_category)], \n",
    "                               category_col = category_col, \n",
    "                               text_col = 'statement',\n",
    "                               nlp = nlp).build()\n",
    "\n",
    "term_df = debate_3.get_term_freq_df()\n",
    "term_df['Trump'] = debate_3.get_scaled_f_scores('TRUMP')\n",
    "term_df['Clinton'] = debate_3.get_scaled_f_scores('CLINTON')\n",
    "\n",
    "print('Trump top terms')\n",
    "print(term_df.sort_values(by='Trump', ascending=False).iloc[:20].index)\n",
    "print('Clinton top terms')\n",
    "print(term_df.sort_values(by='Clinton', ascending=False).iloc[:20].index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#  Clinton vs. Trump word use"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "nbpresent": {
     "id": "ee828df4-fbe8-468f-931e-185017eb74da"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"clinton-trump.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x119bd6668>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "draw_plot(df_all, 'CLINTON', 'TRUMP', 'speaker')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"kaine-pence.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x118d5ca20>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "draw_plot(df_all, 'KAINE', 'PENCE', 'speaker')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"democratic-republican.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x119a2c828>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "draw_plot(df_all, 'Democratic', 'Republican', 'party')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualize LDA topic model of the debates"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## First, create a corpus of all the 2016 debates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_dem_rep = df_all[df_all.party.isin({'Democratic', 'Republican'})]\n",
    "corpus = st.CorpusFromPandas(df_dem_rep, \n",
    "                             category_col = 'party', \n",
    "                             text_col = 'statement',\n",
    "                             nlp = nlp).build()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filter out bigrams and stopwords from the corpus, making a new one called `corpus_uni_stop`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus_uni_stop = corpus.get_stoplisted_unigram_corpus()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train two, party-specifc topic models and one general model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import LatentDirichletAllocation\n",
    "lda_models = {}\n",
    "for party in ['Republican', 'Democratic', 'General']:\n",
    "    #subset the term-document matrix to only speech from one paraty or aanother\n",
    "    if party != 'General':\n",
    "        X = corpus_uni_stop._X[corpus_uni_stop._y == corpus_uni_stop.get_categories().index(party),:]\n",
    "    else:\n",
    "        X = corpus_uni_stop._X\n",
    "    lda_models[party] = (LatentDirichletAllocation(n_topics=20, \n",
    "                                                   max_iter=60,\n",
    "                                                   learning_method='online',\n",
    "                                                   learning_offset=50.,\n",
    "                                                   random_state=0)\n",
    "                         .fit(X))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Some General Topics\n",
      "Topic #0:\n",
      "concerned, office, troubling, installers, deeply, visited, far, threat, man, met\n",
      "Topic #1:\n",
      "chicago, puppet, shootings, 4,000, guns, 1st, january, terrible, 2014, shared\n",
      "Topic #2:\n",
      "trashing, muslims, syrians, pick, bookstore, tomorrow, announced, book, called, stronger\n",
      "\n",
      "Some Republican Topics\n",
      "Topic #0:\n",
      "years, look, 30, entitled, miss, imagine, deductions, half, debt, number\n",
      "Topic #1:\n",
      "nonsense, oh, telling, puppet, cybersecurity, surge, speak, seventh, circuit, respectful\n",
      "Topic #2:\n",
      "slowest, recovery, great, economic, depression, $, audit, release, returns, audited\n",
      "\n",
      "Some Democratic Topics\n",
      "Topic #0:\n",
      "security, going, social, trade, read, solvent, book, energy, enforce, voted\n",
      "Topic #1:\n",
      "apologize, crisis, collapse, fact, worst, shared, minutes, dramatically, improved, 2014\n",
      "Topic #2:\n",
      "think, donald, people, prepared, yes, undocumented, promised, privatize, said, jobs\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def top_words_in_topic(scores, corpus, n_top_words):\n",
    "    return [corpus._term_idx_store.getval(i) for i \n",
    "            in scores.argsort()[:-n_top_words - 1:-1]]\n",
    "\n",
    "def print_some_topics(model):\n",
    "    for topic_idx, topic in list(enumerate(model.components_))[:3]:\n",
    "        print(\"Topic #%d:\" % topic_idx)\n",
    "        print(', '.join(top_words_in_topic(model.components_[topic_idx], corpus_uni_stop, 10)))\n",
    "        \n",
    "    print()\n",
    "print(\"Some General Topics\")\n",
    "print_some_topics(lda_models['General'])\n",
    "print(\"Some Republican Topics\")\n",
    "print_some_topics(lda_models['Republican'])\n",
    "print(\"Some Democratic Topics\")\n",
    "print_some_topics(lda_models['Democratic'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Top terms in Dem topic 1 ['actually', 'admit', 'completely', 'polished', 'values', 'antithetical', 'jeffersonian', 'bit', 'borders', 'open']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"democratic-republican_dem_topic_1.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x118d55630>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "topic_idx = 1\n",
    "party='Democratic'\n",
    "print('Top terms in Dem topic %s' % topic_idx, \n",
    "      top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))\n",
    "draw_corpus(df_dem_rep, \n",
    "            corpus_uni_stop, \n",
    "            'Democratic', \n",
    "            'Republican', \n",
    "            'party', \n",
    "            extra='_dem_topic_%s'%(topic_idx), \n",
    "            scores = lda_models[party].components_[topic_idx],\n",
    "            minimum_term_frequency=1,\n",
    "            singleScoreMode=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Top terms in Rep topic 7 ['doubt', 'jail', 'debunked', 'ugh', 'taunting', 'yeah', 'doing', 'just', 'defective', 'lester']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"democratic-republican_rep_topic_7.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x119089c88>"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "topic_idx = 7\n",
    "party='Republican'\n",
    "print('Top terms in Rep topic %s' % topic_idx, \n",
    "      top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))\n",
    "draw_corpus(df_dem_rep, \n",
    "            corpus_uni_stop, \n",
    "            'Democratic', \n",
    "            'Republican', \n",
    "            'party', \n",
    "            extra='_rep_topic_%s'%(topic_idx), \n",
    "            scores = lda_models[party].components_[topic_idx],\n",
    "            minimum_term_frequency=1,\n",
    "            singleScoreMode=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Top terms in general topic 2 ['shared', 'bet', 'u.s.', 'program', 'suspended', 'citizen', 'announced', 'website', 'think', 'actually']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"democratic-republican_gen_topic_2.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x11d59d2e8>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "topic_idx = 2\n",
    "party='General'\n",
    "print('Top terms in general topic %s' % topic_idx, \n",
    "      top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))\n",
    "draw_corpus(df_dem_rep, \n",
    "            corpus_uni_stop, \n",
    "            'Democratic', \n",
    "            'Republican', \n",
    "            'party', \n",
    "            extra='_gen_topic_%s'%(topic_idx), \n",
    "            scores = lda_models[party].components_[topic_idx],\n",
    "            minimum_term_frequency=1,\n",
    "            singleScoreMode=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "39cd7772-3198-4909-bec9-1db20561b287"
    }
   },
   "source": [
    "## Visualizing Word2Vec term similarity "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "scrolled": false
   },
   "source": [
    "### Score each term in corpus against the word \"job\".  SpaCy includes 300-dimensional word vectors and a cosine-similarity function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Terms that are most similar to \"job\"\n",
      "['job', 'jobs', 'position', 'role', 'work', 'career', 'duty', 'responsibilities', 'tenure', 'contract']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"democratic-republican_embedding_job.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x122b89b38>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_term_text = 'job'\n",
    "base_term = nlp(base_term_text)\n",
    "scores=np.array([base_term.similarity(nlp(tok)) \n",
    "                 for tok \n",
    "                 in corpus_uni_stop._term_idx_store._i2val])\n",
    "\n",
    "\n",
    "\n",
    "print('Terms that are most similar to \"%s\"' % base_term)\n",
    "print(top_words_in_topic(scores, corpus_uni_stop, 10))\n",
    "draw_corpus(df_dem_rep, \n",
    "            corpus_uni_stop, \n",
    "            'Democratic', \n",
    "            'Republican', \n",
    "            'party', \n",
    "            extra = '_embedding_' + base_term_text, \n",
    "            scores = scores,\n",
    "            minimum_term_frequency=1,\n",
    "            singleScoreMode=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'nlp' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-3-2ab6b232bf9f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbase_term\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnlp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wealth'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m scores=np.array([base_term.similarity(nlp(tok)) \n\u001b[1;32m      3\u001b[0m                  \u001b[0;32mfor\u001b[0m \u001b[0mtok\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m                  in corpus_uni_stop._term_idx_store._i2val])\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'nlp' is not defined"
     ]
    }
   ],
   "source": [
    "base_term = nlp('wealth')\n",
    "scores=np.array([base_term.similarity(nlp(tok)) \n",
    "                 for tok \n",
    "                 in corpus_uni_stop._term_idx_store._i2val])\n",
    "\n",
    "print('Terms that are most similar to \"%s\"' % base_term)\n",
    "print(top_words_in_topic(scores, corpus_uni_stop, 10))\n",
    "draw_corpus(df_dem_rep, \n",
    "            corpus_uni_stop, \n",
    "            'Democratic', \n",
    "            'Republican', \n",
    "            'party', \n",
    "            extra='_dem_topic_%s'%(topic_idx), \n",
    "            scores = scores,\n",
    "            minimum_term_frequency=1,\n",
    "            singleScoreMode=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Let's use Scattertext to inspect the coefficients from Lasso-logistic regression\n",
    "* We can see the accuracies used "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/kesslej/anaconda3/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n",
      "  ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Terms have the highest lasso coefficients for predicting Democrats are\n",
      "['chief', 'kind', 'donald', 'mistake', 'intelligence', 'worked', 'debate', 'vladimir', 'good', 'book']\n",
      "Terms have the highest lasso coefficients for predicting Republicans are\n",
      "['tell', 'clinton', 'mean', 'tremendous', 'kaine', 'change', 'country', 'stop', 'happy', 'respond']\n",
      "Cross-validated classification accuracy 0.627182044888\n",
      "Baseline (class-conditional) accuracy 0.56608478803\n"
     ]
    }
   ],
   "source": [
    "l1scores, acc, baseline = corpus_uni_stop.get_logistic_regression_coefs_l1('Democratic')\n",
    "print('Terms have the highest lasso coefficients for predicting Democrats are')\n",
    "print(top_words_in_topic(l1scores, corpus_uni_stop, 10))\n",
    "print('Terms have the highest lasso coefficients for predicting Republicans are')\n",
    "print(top_words_in_topic(-1*l1scores, corpus_uni_stop,  10))\n",
    "print('Cross-validated classification accuracy', acc)\n",
    "print('Baseline (class-conditional) accuracy', baseline)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"democratic-republicanlasso.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x118c2eb38>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#draw_corpus(df_dem_rep,  corpus_uni_stop, 'Democratic', 'Republican', 'party', extra='lasso')\n",
    "draw_corpus(df_dem_rep, \n",
    "            corpus_uni_stop, \n",
    "            'Democratic', \n",
    "            'Republican', \n",
    "            'party', \n",
    "            extra='lasso', \n",
    "            scores = l1scores,\n",
    "            minimum_term_frequency=1,\n",
    "            sort_by_dist = False,\n",
    "            grey_zero_scores = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "16e5e993-5fcd-4d91-8b32-600843c3c863"
    }
   },
   "source": [
    "# Comare Clinton and Trump's 1st debate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "nbpresent": {
     "id": "e17d391f-ab9e-4e7b-afca-ab8e97f104fc"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"clinton-trump1st.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x118d7e860>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "draw_plot(debate_dfs['1st'], 'CLINTON', 'TRUMP', 'speaker', '1st')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "957ecfae-dc13-4a29-a353-5afb9a3599fb"
    }
   },
   "source": [
    "# Compare Trump to Pence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "nbpresent": {
     "id": "c59e1e7f-d845-4af4-83ef-9bd238875ab6"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"trump-pence_trumppence.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x118fae128>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "draw_plot(df_all, 'TRUMP', 'PENCE', 'speaker', '_trumppence')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "c2b9b4d9-5680-4b2f-83a6-1bbfb8685a8e"
    }
   },
   "source": [
    "# Compare the 1st to the 2nd debate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "nbpresent": {
     "id": "4a929431-8c4d-493f-b969-d05bf5b21e4c"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"1000\"\n",
       "            src=\"1st-2nd_1st_vs_2nd.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x185d16048>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "draw_plot(df_all, '1st', '2nd', 'debate', '_1st_vs_2nd')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}