# Introduction to Scattertext

## @jasonkessler

https://github.com/JasonKessler/scattertext



Cite as:
Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.

Link to preprint: https://arxiv.org/abs/1703.00565

```
@article{kessler2017scattertext,
  author    = {Kessler, Jason S.},
  title     = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},
  booktitle = {Proceedings of ACL-2017 System Demonstrations},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
}
```

In [15]:
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy.en
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
# Inject a CSS rule that sets the notebook's `.container` element to 98% width.
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
# Instantiate spaCy's English pipeline; it is applied to every speech further down.
# NOTE(review): `spacy.en.English()` looks like the spaCy 1.x entry point; newer
# releases load models via `spacy.load(...)` — confirm the spaCy version in use.
nlp = spacy.en.English()

# Grab the 2012 political convention data set and preview it

In [3]:
# Load the bundled 2012 convention sample data (columns seen below: party, speaker, text).
convention_df = st.SampleCorpora.ConventionData2012.get_data()

In [4]:
# Preview the first row of the data set.
convention_df.iloc[0]

party                                               democrat
speaker                                         BARACK OBAMA
text       Thank you. Thank you. Thank you. Thank you so ...
Name: 0, dtype: object

In [5]:
# Summarize the corpus size per party, then parse every speech with spaCy.
print("Document Count")
print(convention_df.groupby('party')['text'].count())
print("Word Count")
# Whitespace-delimited token count of each speech, summed within each party.
# This has to be printed explicitly: a bare expression is only displayed when
# it is the last statement of a cell, so the counts were previously discarded.
word_counts = convention_df.groupby('party')['text'].apply(
    lambda texts: texts.apply(lambda speech: len(speech.split())).sum()
)
print(word_counts)
# Keep the parsed document for each speech in a new 'parsed' column.
convention_df['parsed'] = convention_df.text.apply(nlp)

Document Count
party
democrat      123
republican     66
Name: text, dtype: int64
Word Count


# Turn it into a Scattertext corpus, and have spaCy parse it.

In [6]:
# Build the Scattertext corpus from the data frame: 'party' supplies the
# category of each document and 'parsed' holds the pre-parsed text.
corpus_factory = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parsed',
)
corpus = corpus_factory.build()

# Scattertext has some functions to find how associated words are with categories
## Lots of ways to do this. I'm partial to a novel technique called Scaled F-Score
# Intuition:
### Associated terms have a *relatively* high category-specific precision and recall
### F-score is the harmonic mean of precision and recall

## We can calculate the Democratic precision and recall of each term
### - Given the balanced class labels and Zipf's law, precisions (around 50% given the class balance) will be much higher than recalls
### - Typically < 1% of documents contain a particular word
### - This will throw off the harmonic means, favoring frequent terms

In [29]:
# Per-term Democratic precision, recall, and their plain harmonic mean.
term_freq_df = corpus.get_term_freq_df()
dem_freq = term_freq_df['democrat freq']
rep_freq = term_freq_df['republican freq']
# Precision: fraction of a term's occurrences that come from Democratic speeches.
term_freq_df['dem_precision'] = dem_freq * 1. / (dem_freq + rep_freq)
# Recall: fraction of all Democratic term occurrences accounted for by this term.
term_freq_df['dem_recall'] = dem_freq * 1. / dem_freq.sum()


def _dem_f_score(row):
    """Harmonic mean of a row's precision and recall, or 0 unless both are positive."""
    precision, recall = row['dem_precision'], row['dem_recall']
    if precision > 0 and recall > 0:
        return hmean([precision, recall])
    return 0


term_freq_df['dem_f_score'] = term_freq_df.apply(_dem_f_score, axis=1)
term_freq_df.sort_values(by='dem_f_score', ascending=False).iloc[:10]

Unnamed: 0_level_0,democrat freq,republican freq,dem_precision,dem_recall,dem_f_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
the,3402,2532,0.573306,0.022343,0.043009
and,2709,2233,0.548159,0.017791,0.034464
to,2340,1667,0.583978,0.015368,0.029948
a,1602,1345,0.543604,0.010521,0.020643
of,1569,1377,0.532587,0.010304,0.020218
that,1400,1051,0.571195,0.009195,0.018098
we,1318,1146,0.534903,0.008656,0.017036
in,1291,986,0.566974,0.008479,0.016708
i,1098,851,0.563366,0.007211,0.01424
's,1037,631,0.621703,0.006811,0.013473


## Solution:
### Take the normal CDF of precision and recall scores, which will fall between 0 and 1, which scales and standardizes both scores.

In [30]:
#term_freq_df['dem_precision_pctl'] = rankdata(term_freq_df['dem_precision'])*1./len(term_freq_df)
#term_freq_df['dem_recall_pctl'] = rankdata(term_freq_df['dem_recall'])*1./len(term_freq_df)
def normcdf(x):
    """Return the normal CDF of each value in ``x``.

    The normal distribution is parameterized by the mean and standard
    deviation of ``x`` itself, so the results lie between 0 and 1.
    """
    center = x.mean()
    spread = x.std()
    return norm.cdf(x, loc=center, scale=spread)
# Put precision and recall on a comparable 0-1 scale via their normal CDFs,
# then combine them element-wise with the harmonic mean.
for metric in ('dem_precision', 'dem_recall'):
    term_freq_df[metric + '_normcdf'] = normcdf(term_freq_df[metric])
normcdf_columns = [
    term_freq_df['dem_precision_normcdf'],
    term_freq_df['dem_recall_normcdf'],
]
term_freq_df['dem_scaled_f_score'] = hmean(normcdf_columns)
term_freq_df.sort_values(by='dem_scaled_f_score', ascending=False).head(10)

Unnamed: 0_level_0,democrat freq,republican freq,dem_precision,dem_recall,dem_f_score,dem_precision_normcdf,dem_recall_normcdf,dem_scaled_f_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
middle class,148,18,0.891566,0.000972,0.001942,0.769762,1.0,0.869905
auto,37,0,1.0,0.000243,0.000486,0.83601,0.889307,0.861835
fair,45,3,0.9375,0.000296,0.000591,0.799485,0.933962,0.861507
insurance,54,6,0.9,0.000355,0.000709,0.775397,0.965959,0.860251
forward,105,16,0.867769,0.00069,0.001378,0.753443,0.999858,0.859334
president barack,47,4,0.921569,0.000309,0.000617,0.789447,0.942572,0.859241
class,161,25,0.865591,0.001057,0.002112,0.751919,1.0,0.858395
middle,164,27,0.858639,0.001077,0.002151,0.747021,1.0,0.855194
the middle,98,17,0.852174,0.000644,0.001286,0.742422,0.99964,0.852041
medicare,84,15,0.848485,0.000552,0.001103,0.739778,0.99805,0.849722


In [28]:
# Attach the corpus's "rudder" scores for the 'democrat' category to each term.
term_freq_df['dem_corner_score'] = corpus.get_rudder_scores('democrat')
# Ascending sort: show the ten terms with the smallest score.
# NOTE(review): presumably a lower score means closer to the Democratic corner — confirm.
term_freq_df.sort_values(by='dem_corner_score', ascending=True).iloc[:10]

Unnamed: 0_level_0,democrat freq,republican freq,Republican Score,Democratic Score,dem_corner_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
auto,37,0,0.0,0.773567,0.227781
america forward,28,0,0.0,0.7401,0.22787
insurance companies,24,0,0.0,0.721765,0.227934
auto industry,24,0,0.0,0.721765,0.227934
pell,23,0,0.0,0.716824,0.227961
last week,22,0,0.0,0.711735,0.22799
pell grants,21,0,0.0,0.706498,0.228024
platform,20,0,0.0,0.70111,0.228059
women 's,20,0,0.0,0.70111,0.228059
coverage,18,0,0.0,0.689877,0.228159


In [9]:
# Rebuild the frequency table and attach Scattertext's built-in scaled
# F-scores for each party, then list the ten best-scoring terms per party.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Republican Score'] = corpus.get_scaled_f_scores('republican')
term_freq_df['Democratic Score'] = corpus.get_scaled_f_scores('democrat')
for heading, score_column in (("Top 10 Democratic terms", 'Democratic Score'),
                              ("Top 10 Republican terms", 'Republican Score')):
    print(heading)
    ranked_terms = term_freq_df.sort_values(by=score_column, ascending=False).index
    pprint(list(ranked_terms[:10]))

Top 10 Democratic terms
['auto',
 'america forward',
 'fought for',
 'fair',
 'insurance companies',
 'auto industry',
 'president barack',
 'pell',
 'fighting for',
 'last week']
Top 10 Republican terms
['unemployment',
 'do better',
 'liberty',
 'olympics',
 'built it',
 'reagan',
 'it has',
 'ann',
 'big government',
 'story of']


# Make and visualize chart, scale based on raw frequency.
### - A word used 10 times by Republicans will be at position 10 on the x-axis 
### - This isn't very useful.  Everything but the most frequent terms is squished into the lower-left corner
### - The corner-distance scores are largely stopwords
### - By default, color words by Scaled F-Score

In [10]:
# Build the explorer with axes scaled by raw term frequency (st.Scalers.scale).
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    pmi_filter_thresold=4,
                                    transform=st.Scalers.scale,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextScale.html'
# Write through a context manager so the handle is flushed and closed before
# the IFrame below loads the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)


## Using log scales seems to help a bit, but blank space and stop words still dominate the graph
### The characteristic terms look much more informative

In [11]:
# Build the explorer with log-scaled, standardized axes
# (st.Scalers.log_scale_standardize). No speaker metadata is passed here.
html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=5,
                                       pmi_filter_thresold=4,
                                       width_in_pixels=1000,
                                       transform=st.Scalers.log_scale_standardize)
file_name = 'Conventions2012ScattertextLog.html'
# Write through a context manager so the handle is flushed and closed before
# the IFrame below loads the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)


# Rank terms by frequency percentiles instead of raw frequencies.  
### A term at the middle of the x-axis will be mentioned by Republicans at the median frequency.
### This nicely distributes terms throughout the space
### But, terms occurring with the same frequencies in both classes are stacked atop each other.
### Can't mouseover points not at top of stack.

In [12]:
# Build the explorer with axes based on frequency percentiles (st.Scalers.percentile).
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    pmi_filter_thresold=4,
                                    transform=st.Scalers.percentile,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextRankData.html'
# Write through a context manager so the handle is flushed and closed before
# the IFrame below loads the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)


# One solution is to randomly jitter each point
## Points don't leave enough space for many labels
## Top terms are largely a result of jitter

In [13]:
# Same percentile-scaled chart, with jitter=0.1 passed to the explorer so that
# stacked points are randomly spread apart.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    jitter=0.1,
                                    minimum_term_frequency=5,
                                    pmi_filter_thresold=4,
                                    transform=st.Scalers.percentile,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextRankDataJitter.html'
# Write through a context manager so the handle is flushed and closed before
# the IFrame below loads the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)


# The preferred solution is to fall back to alphabetic order among equally frequent terms
## Lets you mouseover all points
## Leaves a bit of room for labels
## Top points may be slightly distorted

In [14]:
# Build the explorer without an explicit `transform`, so the library's default
# axis scaling applies, and pass a log-odds-ratio (uninformative Dirichlet
# prior) object as the term-significance measure.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    pmi_filter_thresold=4,
                                    metadata=convention_df['speaker'],
                                    term_significance=st.LogOddsRatioUninformativeDirichletPrior())
file_name = 'Conventions2012ScattertextRankDefault.html'
# Write through a context manager so the handle is flushed and closed before
# the IFrame below loads the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)

# Scattertext can also be used for alternative visualizations
## Visualize L2-penalized logistic regression coefficients vs. log term frequency
Similar to Monroe et al. (2008).

Burt L. Monroe, Michael P. Colaresi, and Kevin M. Quinn. 2008. Fightin’ words: Lexical feature selection and evaluation for identifying the content of political conflict. Political Analysis.

In [18]:
from sklearn.linear_model import LogisticRegression
def scale(ar):
    """Min-max scale ``ar`` so its smallest value maps to 0 and its largest to 1.

    If every value is equal the denominator is zero, so the result is NaN
    for a NumPy array.
    """
    return (ar - ar.min()) / (ar.max() - ar.min())


def zero_centered_scale(ar):
    """Scale ``ar`` into [0, 1] with zero mapped to the midpoint 0.5.

    Positive and negative entries are min-max scaled separately: positives
    land in [0.5, 1] (the largest positive at 1), negatives land in [0, 0.5]
    (the most negative at 0), and zeros stay at 0.5.

    The input is left untouched: the work is done on a float copy. Previously
    the caller's array was overwritten in place, and scaled values were
    truncated when the input had an integer dtype.

    Inputs without any positive (or without any negative) entries are
    supported; that side is simply skipped instead of raising on an empty
    ``min()``.
    """
    scaled = ar.astype(float)  # astype returns a copy, protecting the caller's data
    positive = scaled > 0
    negative = scaled < 0
    if positive.any():
        scaled[positive] = scale(scaled[positive])
    if negative.any():
        scaled[negative] = -scale(-scaled[negative])
    return (scaled + 1) / 2.

# x-axis: log of each term's total count across both parties, rescaled to [0, 1].
# Sum only the two frequency columns. By this point term_freq_df also carries
# the 'Republican Score' / 'Democratic Score' columns, and summing every column
# would fold those scores into the "frequency".
frequency_columns = ['democrat freq', 'republican freq']
frequencies_scaled = scale(np.log(term_freq_df[frequency_columns].sum(axis=1).values))
# y-axis: coefficients of an L2-penalized logistic regression for the
# 'democrat' category, rescaled by zero_centered_scale.
scores = corpus.get_logreg_coefs('democrat',
                                 LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1))
scores_scaled = zero_centered_scale(scores)

# Plot custom coordinates: scaled log frequency on the x-axis and scaled
# logistic-regression coefficients on the y-axis.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    pmi_filter_thresold=4,
                                    width_in_pixels=1000,
                                    x_coords=frequencies_scaled,
                                    y_coords=scores_scaled,
                                    scores=scores,
                                    sort_by_dist=False,
                                    metadata=convention_df['speaker'],
                                    x_label='Log frequency',
                                    y_label='L2-penalized logistic regression coef')
file_name = 'L2vsLog.html'
# Write through a context manager so the handle is flushed and closed before
# the IFrame below loads the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)