{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/kesslej/anaconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", " \"This module will be removed in 0.20.\", DeprecationWarning)\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import scattertext as ST\n", "import tarfile, urllib, io\n", "import pandas as pd\n", "from IPython.display import IFrame\n", "from IPython.core.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "'''From Bo Pang's website: https://www.cs.cornell.edu/people/pabo/movie-review-data/\n", "\n", "Data from:\n", "``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization \n", "Based on Minimum Cuts'', Proceedings of the ACL, 2004\n", "'''\n", "SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'\n", "data = io.BytesIO(urllib.request.urlopen(SUBJECTIVITY_URL).read())\n", "tarball = tarfile.open(fileobj=data, mode = 'r:gz')\n", "readme = tarball.extractfile('subjdata.README.1.0').read()\n", "quote = tarball.extractfile('quote.tok.gt9.5000').read()\n", "plot = tarball.extractfile('plot.tok.gt9.5000').read()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['smart and alert , thirteen conversations about one thing is a small gem . 
',\n", " 'color , musical bounce and warm seas lapping on island shores . and just enough science to send you home thinking . ',\n", " 'it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . ']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Examples of subjective sentences in corpus\n", "quote.decode('utf-8', errors='ignore').split('\\n')[:3]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "'''Construct subjective vs. objective pandas dataframe, \n", "treating review quotes as subjective, and plot points as objective.\n", "'''\n", "df = pd.DataFrame(\n", " [{'text': text.strip(), 'label': 'subjective'} for text \n", " in quote.decode('utf-8', errors='ignore').split('\\n')] \n", " + [{'text': text.strip(), 'label': 'objective'} for text \n", " in plot.decode('utf-8', errors='ignore').split('\\n')]\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "'''Convert Pandas dataframe to a term-document matrix, indicating\n", "the category column is \"label\" and the text column name is \"text\".'''\n", "\n", "\n", "term_doc_mat = ST.TermDocMatrixFromPandas(data_frame = df, \n", " category_col = 'label', \n", " text_col = 'text',\n", " # Note: use nlp=spacy.en.English() for text that's not pre-tokenized\n", " nlp = ST.fast_but_crap_nlp \n", " ).build()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "ename": "AssertionError", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m 
\u001b[0mpmi_filter_thresold\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mminimum_term_frequency\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m width_in_pixels=1000)\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# Hack to display HTML with D3 in Jupyter Notebook\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/kesslej/anaconda3/lib/python3.5/site-packages/scattertext-0.0.1.9.8-py3.5.egg/scattertext/__init__.py\u001b[0m in \u001b[0;36mproduce_scattertext_explorer\u001b[0;34m(corpus, category, category_name, not_category_name, protocol, pmi_filter_thresold, minimum_term_frequency, max_terms, filter_unigrams, height_in_pixels, width_in_pixels, max_snippets, max_docs_per_category, metadata, scores, singleScoreMode, use_full_doc, term_ranker)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0mfilter_unigrams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfilter_unigrams\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[0mmax_terms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmax_terms\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 151\u001b[0;31m \t term_ranker=term_ranker)\n\u001b[0m\u001b[1;32m 152\u001b[0m \tscatter_chart_data = scatter_chart_explorer.to_dict(category=category,\n\u001b[1;32m 153\u001b[0m \u001b[0mcategory_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcategory_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/kesslej/anaconda3/lib/python3.5/site-packages/scattertext-0.0.1.9.8-py3.5.egg/scattertext/ScatterChartExplorer.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, corpus, **kwargs)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m 
\t\t'''\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCorpus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mScatterChart\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } ], "source": [ "'''\n", "Filter out bigrams with PMI < 3, and unigrams and bigrams that occur less than 20 times. \n", "The variable html is a string containing the HTML that makes up the scattertext visualization\n", "'''\n", "html = ST.produce_scattertext_html(term_doc_mat, \n", " category='subjective', \n", " category_name='Subjective', \n", " not_category_name='Objective',\n", " protocol='https',\n", " pmi_filter_thresold=3,\n", " minimum_term_frequency=20,\n", " width_in_pixels=1000)\n", "\n", "# Hack to display HTML with D3 in Jupyter Notebook\n", "open('subj_obj_scatter.html', 'wb').write(html.encode('utf-8'))\n", "IFrame(src='subj_obj_scatter.html', width = 1200, height=1000)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
objective freqsubjective freqSubjective Score
term
movie that0750.803250
entertaining2730.771629
film s2690.767533
but it61570.766663
i132750.755910
interesting3700.752203
film that4770.744846
performances5890.742972
of its61030.742011
in its5840.737945
me2510.737812
script4710.736981
movie is5830.736840
if you6960.736319
fascinating2480.730420
cinematic2470.727758
funny91260.726650
laughs0300.725776
movie s0300.725776
you re4640.725331
\n", "
" ], "text/plain": [ " objective freq subjective freq Subjective Score\n", "term \n", "movie that 0 75 0.803250\n", "entertaining 2 73 0.771629\n", "film s 2 69 0.767533\n", "but it 6 157 0.766663\n", "i 13 275 0.755910\n", "interesting 3 70 0.752203\n", "film that 4 77 0.744846\n", "performances 5 89 0.742972\n", "of its 6 103 0.742011\n", "in its 5 84 0.737945\n", "me 2 51 0.737812\n", "script 4 71 0.736981\n", "movie is 5 83 0.736840\n", "if you 6 96 0.736319\n", "fascinating 2 48 0.730420\n", "cinematic 2 47 0.727758\n", "funny 9 126 0.726650\n", "laughs 0 30 0.725776\n", "movie s 0 30 0.725776\n", "you re 4 64 0.725331" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "''' Display top 20 terms that are characteristic of a subjective document-label and their frequencies.\n", "'''\n", "term_freq_df = term_doc_mat.get_term_freq_df()\n", "term_freq_df['Subjective Score'] = term_doc_mat.get_scaled_f_scores('subjective', scaler_algo='percentile')\n", "term_freq_df = term_freq_df.sort_values(by='Subjective Score', ascending=False)\n", "term_freq_df.iloc[:20]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
corpusbackgroundLog Posterior Mean Ratio
doesn176.01101832.06.972770
isn125.01345149.06.392687
discovers70.01974534.05.356073
cinematic49.01255895.05.091466
filmmaker51.01493747.05.063639
cannot29.088737.04.860555
filmmaking37.01061519.04.768377
thriller78.05364843.04.722203
didn32.0850882.04.648173
filmmakers39.01657073.04.629892
comedy229.022993280.04.591236
quirky35.01436076.04.553131
documentary113.010429008.04.547708
film1006.0116097842.04.512189
entertaining75.06330073.04.503101
mysterious65.05252752.04.483029
decides58.04588774.04.447191
performances94.09272429.04.417802
learns40.02570984.04.390325
hasn20.076625.04.352190
\n", "
" ], "text/plain": [ " corpus background Log Posterior Mean Ratio\n", "doesn 176.0 1101832.0 6.972770\n", "isn 125.0 1345149.0 6.392687\n", "discovers 70.0 1974534.0 5.356073\n", "cinematic 49.0 1255895.0 5.091466\n", "filmmaker 51.0 1493747.0 5.063639\n", "cannot 29.0 88737.0 4.860555\n", "filmmaking 37.0 1061519.0 4.768377\n", "thriller 78.0 5364843.0 4.722203\n", "didn 32.0 850882.0 4.648173\n", "filmmakers 39.0 1657073.0 4.629892\n", "comedy 229.0 22993280.0 4.591236\n", "quirky 35.0 1436076.0 4.553131\n", "documentary 113.0 10429008.0 4.547708\n", "film 1006.0 116097842.0 4.512189\n", "entertaining 75.0 6330073.0 4.503101\n", "mysterious 65.0 5252752.0 4.483029\n", "decides 58.0 4588774.0 4.447191\n", "performances 94.0 9272429.0 4.417802\n", "learns 40.0 2570984.0 4.390325\n", "hasn 20.0 76625.0 4.352190" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "''' Display unigrams most characteristic of corpus against all of English that aren't unique to it.\n", "\n", "Note: \"doesn\", \"isn\", and \"didn\" are a result of the pre-tokenization of the corpus.\n", "'''\n", "characteristic_terms = term_doc_mat.get_posterior_mean_ratio_scores_vs_background()\n", "characteristic_terms[characteristic_terms['background'] > 0].iloc[:20]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "Python [Root]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }