{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/kesslej/anaconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", " \"This module will be removed in 0.20.\", DeprecationWarning)\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import scattertext as ST\n", "import tarfile, urllib, io\n", "import pandas as pd\n", "from IPython.display import IFrame\n", "from IPython.core.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "'''From Bo Pang's website: https://www.cs.cornell.edu/people/pabo/movie-review-data/\n", "\n", "Data from:\n", "``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization \n", "Based on Minimum Cuts'', Proceedings of the ACL, 2004\n", "'''\n", "SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'\n", "data = io.BytesIO(urllib.request.urlopen(SUBJECTIVITY_URL).read())\n", "tarball = tarfile.open(fileobj=data, mode = 'r:gz')\n", "readme = tarball.extractfile('subjdata.README.1.0').read()\n", "quote = tarball.extractfile('quote.tok.gt9.5000').read()\n", "plot = tarball.extractfile('plot.tok.gt9.5000').read()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['smart and alert , thirteen conversations about one thing is a small gem . 
',\n", " 'color , musical bounce and warm seas lapping on island shores . and just enough science to send you home thinking . ',\n", " 'it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . ']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Examples of subjective sentences in corpus\n", "quote.decode('utf-8', errors='ignore').split('\\n')[:3]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "'''Construct subjective vs. objective pandas dataframe, \n", "treating review quotes as subjective, and plot points as objective.\n", "'''\n", "df = pd.DataFrame(\n", " [{'text': text.strip(), 'label': 'subjective'} for text \n", " in quote.decode('utf-8', errors='ignore').split('\\n')] \n", " + [{'text': text.strip(), 'label': 'objective'} for text \n", " in plot.decode('utf-8', errors='ignore').split('\\n')]\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "'''Convert Pandas dataframe to a term-document matrix, indicating\n", "the category column is \"label\" and the text column name is \"text\".'''\n", "\n", "\n", "term_doc_mat = ST.TermDocMatrixFromPandas(data_frame = df, \n", " category_col = 'label', \n", " text_col = 'text',\n", " # Note: use nlp=spacy.en.English() for text that's not pre-tokenized\n", " nlp = ST.fast_but_crap_nlp \n", " ).build()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "ename": "AssertionError", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m 
\u001b[0mpmi_filter_thresold\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mminimum_term_frequency\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m width_in_pixels=1000)\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# Hack to display HTML with D3 in Jupyter Notebook\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/kesslej/anaconda3/lib/python3.5/site-packages/scattertext-0.0.1.9.8-py3.5.egg/scattertext/__init__.py\u001b[0m in \u001b[0;36mproduce_scattertext_explorer\u001b[0;34m(corpus, category, category_name, not_category_name, protocol, pmi_filter_thresold, minimum_term_frequency, max_terms, filter_unigrams, height_in_pixels, width_in_pixels, max_snippets, max_docs_per_category, metadata, scores, singleScoreMode, use_full_doc, term_ranker)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0mfilter_unigrams\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfilter_unigrams\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[0mmax_terms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmax_terms\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 151\u001b[0;31m \t term_ranker=term_ranker)\n\u001b[0m\u001b[1;32m 152\u001b[0m \tscatter_chart_data = scatter_chart_explorer.to_dict(category=category,\n\u001b[1;32m 153\u001b[0m \u001b[0mcategory_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcategory_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/kesslej/anaconda3/lib/python3.5/site-packages/scattertext-0.0.1.9.8-py3.5.egg/scattertext/ScatterChartExplorer.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, corpus, **kwargs)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m 
\t\t'''\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCorpus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mScatterChart\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } ], "source": [ "'''\n", "Filter out bigrams with PMI < 3, and unigrams and bigrams that occur less than 20 times. \n", "The variable html is a string containing the HTML that makes up the scattertext visualization\n", "'''\n", "html = ST.produce_scattertext_html(term_doc_mat, \n", " category='subjective', \n", " category_name='Subjective', \n", " not_category_name='Objective',\n", " protocol='https',\n", " pmi_filter_thresold=3,\n", " minimum_term_frequency=20,\n", " width_in_pixels=1000)\n", "\n", "# Hack to display HTML with D3 in Jupyter Notebook\n", "open('subj_obj_scatter.html', 'wb').write(html.encode('utf-8'))\n", "IFrame(src='subj_obj_scatter.html', width = 1200, height=1000)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
objective freqsubjective freqSubjective Score
term
movie that0750.803250
entertaining2730.771629
film s2690.767533
but it61570.766663
i132750.755910
interesting3700.752203
film that4770.744846
performances5890.742972
of its61030.742011
in its5840.737945
me2510.737812
script4710.736981
movie is5830.736840
if you6960.736319
fascinating2480.730420
cinematic2470.727758
funny91260.726650
laughs0300.725776
movie s0300.725776
you re4640.725331
\n", "
" ], "text/plain": [ " objective freq subjective freq Subjective Score\n", "term \n", "movie that 0 75 0.803250\n", "entertaining 2 73 0.771629\n", "film s 2 69 0.767533\n", "but it 6 157 0.766663\n", "i 13 275 0.755910\n", "interesting 3 70 0.752203\n", "film that 4 77 0.744846\n", "performances 5 89 0.742972\n", "of its 6 103 0.742011\n", "in its 5 84 0.737945\n", "me 2 51 0.737812\n", "script 4 71 0.736981\n", "movie is 5 83 0.736840\n", "if you 6 96 0.736319\n", "fascinating 2 48 0.730420\n", "cinematic 2 47 0.727758\n", "funny 9 126 0.726650\n", "laughs 0 30 0.725776\n", "movie s 0 30 0.725776\n", "you re 4 64 0.725331" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "''' Display top 20 terms that are characteristic of a subjective document-label and their frequencies.\n", "'''\n", "term_freq_df = term_doc_mat.get_term_freq_df()\n", "term_freq_df['Subjective Score'] = term_doc_mat.get_scaled_f_scores('subjective', scaler_algo='percentile')\n", "term_freq_df = term_freq_df.sort_values(by='Subjective Score', ascending=False)\n", "term_freq_df.iloc[:20]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
corpusbackgroundLog Posterior Mean Ratio
doesn176.01101832.06.972770
isn125.01345149.06.392687
discovers70.01974534.05.356073
cinematic49.01255895.05.091466
filmmaker51.01493747.05.063639
cannot29.088737.04.860555
filmmaking37.01061519.04.768377
thriller78.05364843.04.722203
didn32.0850882.04.648173
filmmakers39.01657073.04.629892
comedy229.022993280.04.591236
quirky35.01436076.04.553131
documentary113.010429008.04.547708
film1006.0116097842.04.512189
entertaining75.06330073.04.503101
mysterious65.05252752.04.483029
decides58.04588774.04.447191
performances94.09272429.04.417802
learns40.02570984.04.390325
hasn20.076625.04.352190
\n", "
" ], "text/plain": [ " corpus background Log Posterior Mean Ratio\n", "doesn 176.0 1101832.0 6.972770\n", "isn 125.0 1345149.0 6.392687\n", "discovers 70.0 1974534.0 5.356073\n", "cinematic 49.0 1255895.0 5.091466\n", "filmmaker 51.0 1493747.0 5.063639\n", "cannot 29.0 88737.0 4.860555\n", "filmmaking 37.0 1061519.0 4.768377\n", "thriller 78.0 5364843.0 4.722203\n", "didn 32.0 850882.0 4.648173\n", "filmmakers 39.0 1657073.0 4.629892\n", "comedy 229.0 22993280.0 4.591236\n", "quirky 35.0 1436076.0 4.553131\n", "documentary 113.0 10429008.0 4.547708\n", "film 1006.0 116097842.0 4.512189\n", "entertaining 75.0 6330073.0 4.503101\n", "mysterious 65.0 5252752.0 4.483029\n", "decides 58.0 4588774.0 4.447191\n", "performances 94.0 9272429.0 4.417802\n", "learns 40.0 2570984.0 4.390325\n", "hasn 20.0 76625.0 4.352190" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "''' Display unigrams most characteristic of corpus against all of English that aren't unique to it.\n", "\n", "Note: \"doesn\", \"isn\", and \"didn\" are a result of the pre-tokenization of the corpus.\n", "'''\n", "characteristic_terms = term_doc_mat.get_posterior_mean_ratio_scores_vs_background()\n", "characteristic_terms[characteristic_terms['background'] > 0].iloc[:20]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "Python [Root]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }