In [1]:
import io
import tarfile
import urllib
import urllib.request  # `import urllib` alone does not load the `request` submodule

import pandas as pd
import scattertext as ST
from IPython.display import IFrame
from IPython.core.display import display, HTML
# Inject a CSS rule so the notebook container uses 90% of the browser width.
display(
    HTML(data="<style>.container { width:90% !important; }</style>")
)



In [2]:
'''Download the subjectivity dataset from Bo Pang's website:
https://www.cs.cornell.edu/people/pabo/movie-review-data/

Data from:
"A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization
Based on Minimum Cuts", Proceedings of the ACL, 2004

After this cell runs, `readme`, `quote` and `plot` hold the raw bytes of the
corresponding archive members.
'''
SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'

# Pull the whole archive into memory; the `with` block closes the HTTP
# response as soon as the payload has been read.
with urllib.request.urlopen(SUBJECTIVITY_URL) as response:
    data = io.BytesIO(response.read())

# Read the three members we need, then release the tarfile handle.
with tarfile.open(fileobj=data, mode='r:gz') as tarball:
    readme = tarball.extractfile('subjdata.README.1.0').read()
    quote = tarball.extractfile('quote.tok.gt9.5000').read()
    plot = tarball.extractfile('plot.tok.gt9.5000').read()

In [3]:
# A peek at the corpus: the first three subjective (review-quote) sentences
subjective_sentences = quote.decode('utf-8', errors='ignore').split('\n')
subjective_sentences[:3]

['smart and alert , thirteen conversations about one thing is a small gem . ',
 'color , musical bounce and warm seas lapping on island shores . and just enough science to send you home thinking . ',
 'it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . ']

In [4]:
'''Construct the subjective vs. objective pandas dataframe,
treating review quotes as subjective and plot points as objective.

Each row has a "text" column (one stripped sentence) and a "label" column.
'''


def _labelled_sentences(raw_bytes, label):
    """Decode raw corpus bytes and return one {'text', 'label'} record per non-blank line."""
    lines = raw_bytes.decode('utf-8', errors='ignore').split('\n')
    # Skip blank lines: a file ending in '\n' would otherwise contribute
    # an empty document to the corpus.
    return [{'text': line.strip(), 'label': label}
            for line in lines if line.strip()]


df = pd.DataFrame(
    _labelled_sentences(quote, 'subjective')
    + _labelled_sentences(plot, 'objective')
)

In [5]:
'''Build a term-document matrix from the dataframe: the category of each
document comes from the "label" column and its text from the "text" column.'''

# Note: use nlp=spacy.en.English() for text that's not pre-tokenized
term_doc_mat_factory = ST.TermDocMatrixFromPandas(
    data_frame=df,
    category_col='label',
    text_col='text',
    nlp=ST.fast_but_crap_nlp,
)
term_doc_mat = term_doc_mat_factory.build()

In [9]:
'''
Filter out bigrams with PMI < 3, and unigrams and bigrams that occur fewer than 20 times.
The variable html is a string containing the HTML that makes up the scattertext visualization.
'''
html = ST.produce_scattertext_html(term_doc_mat,
                                   category='subjective',
                                   category_name='Subjective',
                                   not_category_name='Objective',
                                   protocol='https',
                                   pmi_filter_thresold=3,
                                   minimum_term_frequency=20,
                                   width_in_pixels=1000)

# Hack to display HTML with D3 in Jupyter Notebook: save the page to disk and
# embed it in an IFrame. The `with` block guarantees the file is flushed and
# closed before the IFrame tries to load it.
with open('subj_obj_scatter.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src='subj_obj_scatter.html', width=1200, height=1000)

AssertionError: 

In [7]:
''' Show the 20 terms most characteristic of the subjective label, together
with their per-category frequencies, ranked by percentile-scaled F-score.
'''
term_freq_df = term_doc_mat.get_term_freq_df()
subjective_scores = term_doc_mat.get_scaled_f_scores('subjective',
                                                     scaler_algo='percentile')
term_freq_df['Subjective Score'] = subjective_scores
term_freq_df = term_freq_df.sort_values(by='Subjective Score', ascending=False)
term_freq_df.head(20)

Unnamed: 0_level_0,objective freq,subjective freq,Subjective Score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
movie that,0,75,0.80325
entertaining,2,73,0.771629
film s,2,69,0.767533
but it,6,157,0.766663
i,13,275,0.75591
interesting,3,70,0.752203
film that,4,77,0.744846
performances,5,89,0.742972
of its,6,103,0.742011
in its,5,84,0.737945


In [8]:
''' Show the 20 unigrams that best distinguish this corpus from general English,
restricted to terms that also occur in the background counts.

Note: "doesn", "isn", and "didn" are a result of the pre-tokenization of the corpus.
'''
characteristic_terms = term_doc_mat.get_posterior_mean_ratio_scores_vs_background()
seen_in_background = characteristic_terms['background'] > 0
characteristic_terms.loc[seen_in_background].head(20)

Unnamed: 0,corpus,background,Log Posterior Mean Ratio
doesn,176.0,1101832.0,6.97277
isn,125.0,1345149.0,6.392687
discovers,70.0,1974534.0,5.356073
cinematic,49.0,1255895.0,5.091466
filmmaker,51.0,1493747.0,5.063639
cannot,29.0,88737.0,4.860555
filmmaking,37.0,1061519.0,4.768377
thriller,78.0,5364843.0,4.722203
didn,32.0,850882.0,4.648173
filmmakers,39.0,1657073.0,4.629892
