{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Topic-specific term associations through word representations\n",
    "## How do Democrats and Republicans talk different about jobs\n",
    "\n",
    "https://github.com/JasonKessler/scattertext\n",
    "\n",
    "Cite as:\n",
    "Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.\n",
    "\n",
    "Link to preprint: https://arxiv.org/abs/1703.00565\n",
    "\n",
    "`\n",
    "@article{kessler2017scattertext,\n",
    "  author    = {Kessler, Jason S.},\n",
    "  title     = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n",
    "  booktitle = {ACL System Demonstrations},\n",
    "  year      = {2017},\n",
    "}\n",
    "`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>.container { width:98% !important; }</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "import scattertext as st\n",
    "from gensim.models import word2vec\n",
    "import re, io, itertools\n",
    "from pprint import pprint\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import spacy\n",
    "import os, pkgutil, json, urllib\n",
    "from urllib.request import urlopen\n",
    "from IPython.display import IFrame\n",
    "from IPython.core.display import display, HTML\n",
    "display(HTML(\"<style>.container { width:98% !important; }</style>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nlp = spacy.load('en')\n",
    "# If this doesn't work, please uncomment the following line and use a regex-based parser instead\n",
    "#nlp = st.whitespace_nlp_with_sentences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the 2012 Conventions Dataset\n",
    "### We'll limit the study to unigrams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "convention_df = st.SampleCorpora.ConventionData2012.get_data()\n",
    "convention_df['parsed'] = convention_df.text.apply(nlp)\n",
    "corpus = (st.CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed')\n",
    "          .build()\n",
    "          .get_unigram_corpus())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use Gensim to run Word2Vec on the corpus.\n",
    "### Word2Vec encodes each word in a dense K-dimensional vector space\n",
    "### Cosine distances between terms vectors correspond to semantic similarity "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('create', 0.9190447926521301),\n",
       " ('businesses', 0.8814688920974731),\n",
       " ('million', 0.8395127058029175),\n",
       " ('taxes', 0.8300786018371582),\n",
       " ('millions', 0.829835057258606),\n",
       " ('created', 0.8269357085227966),\n",
       " ('pay', 0.8228686451911926),\n",
       " ('families', 0.8117849826812744),\n",
       " ('lives', 0.8079125881195068),\n",
       " ('debt', 0.8053802847862244)]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = word2vec.Word2Vec(size=100, window=5, min_count=10, workers=4)\n",
    "model = st.Word2VecFromParsedCorpus(corpus, model).train(epochs=10000)\n",
    "model.wv.most_similar('jobs')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9677"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corpus._df[corpus._parsed_col].apply(lambda x: len(list(x.sents))).sum()\n",
    "#model.corpus_count"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Draw the Scattertext by only coloring points that have are associated with a category (p < 0.05 via log-odds w/ prior)\n",
    "### The top Democratic and Republican terms are raked by their similarity to \"jobs\"\n",
    "#### Only the terms associated to a category are considered. \n",
    "### On the far right, the most similar terms, regardless of category association, are listed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"700\"\n",
       "            src=\"output/demo_similarity_gensim.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x110a99080>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_term = 'jobs'\n",
    "\n",
    "html = st.word_similarity_explorer_gensim(corpus,\n",
    "                                          category='democrat',\n",
    "                                          category_name='Democratic',\n",
    "                                          not_category_name='Republican',\n",
    "                                          target_term=target_term,\n",
    "                                          minimum_term_frequency=5,\n",
    "                                          width_in_pixels=1000,\n",
    "                                          word2vec=model,\n",
    "                                          metadata=convention_df['speaker'])\n",
    "file_name = 'output/demo_similarity_gensim.html'\n",
    "open(file_name, 'wb').write(html.encode('utf-8'))\n",
    "IFrame(src=file_name, width = 1200, height=700)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Instead of using vectors trained on the Corpus, we can use the spaCy-provided word vectors trained on the Common Crawl Corpus.\n",
    "### These are trained on a lot more data, but aren't specific to the corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1200\"\n",
       "            height=\"700\"\n",
       "            src=\"output/demo_similarity.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x13aac0080>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Note: this will fail if you did not use spaCy as your parser.\n",
    "html = st.word_similarity_explorer(corpus,\n",
    "                                   category='democrat',\n",
    "                                   category_name='Democratic',\n",
    "                                   not_category_name='Republican',\n",
    "                                   target_term='jobs',\n",
    "                                   minimum_term_frequency=5,\n",
    "                                   width_in_pixels=1000,\n",
    "                                   metadata=convention_df['speaker'])\n",
    "file_name = 'output/demo_similarity.html'\n",
    "open(file_name, 'wb').write(html.encode('utf-8'))\n",
    "IFrame(src=file_name, width = 1200, height=700)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}