In [2]:
import scattertext as st
import re
from pprint import pprint
import numpy as np
import pandas as pd
import spacy.en
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [3]:
nlp = spacy.en.English()

# Parse debates and create plotting interface
The function returns a Pandas data frame consisting of two columns, speaker and statement.  Speaker is the name of the speaker, given in all caps, and statement is the speech made during a particular turn.  

In [4]:
def debate_transcript_to_dataframe(fn, speakers):
    lines = open(fn).read().split('\n')
    cur_speaker = None
    speaker_start_re = re.compile(r'^([(]?[A-Z][A-Z][A-Z]+):?(.+)$')
    transcript = []
    cur_statement = ''
    cur_speaker = None
    for line in lines:
        match = speaker_start_re.match(line)
        if match:
            if match.group(1).startswith('('):
                continue
            if cur_speaker is not None:
                transcript.append({'speaker': cur_speaker, 'statement': cur_statement})
            cur_speaker = match.group(1).strip()
            cur_statement = match.group(2).strip() + '\n'
            for other_name in speakers:
                if other_name+':' in cur_statement:
                    cur_statement, other_statement = cur_statement.split(other_name)
                    transcript.append({'speaker': cur_speaker, 'statement': cur_statement.strip()})
                    transcript.append({'speaker': other_name, 'statement': other_statement.strip()})   
        else:
            cur_statement += line 
    df = pd.DataFrame(transcript)
    return df

## Read debates into Pandas data frames

In [5]:
parties = {'QUIJANO':'Moderator', 
           'KAINE':'Democratic', 
           'PENCE':'Republican', 
           'HOLT':'Moderator', 
           'CLINTON':'Democratic', 
           'TRUMP':'Republican',
           'COOPER':'Moderator',
           'RADDATZ':'Moderator',
           'WALLACE':'Moderator'}

debate_dfs = {}
for info in [
    {'debate': '1st', 'fn': 'presidential-debate-2016-09-26.txt', 'participants': ['TRUMP','CLINTON','HOLT']},
    {'debate': 'VP', 'fn': 'vp-debate-2016-10-04.txt', 'participants': ['PENCE','KAINE','QUIJANO']},
    {'debate': '2nd', 'fn': 'debate-2016-10-09-rush.txt', 'participants': ['TRUMP','CLINTON','COOPER','RADDATZ']},
    {'debate': '3rd', 'fn': 'debate-2016-10-19.txt', 'participants': ['TRUMP','CLINTON','WALLACE']}]:
    cur_df = debate_transcript_to_dataframe(info['fn'], info['participants'])
    cur_df['debate'] = info['debate']
    cur_df['party'] = cur_df['speaker'].apply(lambda x: parties[x])
    cur_df['speaker and debate']=cur_df['speaker'].apply(lambda x: x + ' ' + info['debate'])
    debate_dfs[info['debate']] = cur_df   
df_all = pd.concat(debate_dfs.values())
df_all.iloc[:2]

Unnamed: 0,speaker,statement,debate,party,speaker and debate
0,QUIJANO,Good evening. From Longwood University in Farm...,VP,Moderator,QUIJANO VP
1,QUIJANO,"I'm Elaine Quijano, anchor at CBSN, and corres...",VP,Moderator,QUIJANO VP


In [6]:
df_all.to_csv('presidential_debates_2016.csv.gz', compression='gzip', index=False)

In [7]:
!cp presidential_debates_2016.csv.gz ../scattertext/scattertext/data/

## Function to draw scatter plot in notebook. 
Creates a chart from text in a data frame, `df`.  The `category` and `other_category` parameters are the names of the columns we'll compare.  The `category_col` is the column in `df` that contains document categories, and contains `category` and `other_category`.  For example, if `category` is "TRUMP", then `category_col` would be "speaker". `extra` is append to the file name of the html file produced. 

We'll look at the rest of the optional parameters later.

The function returns an iFrame containing containing the HTML visualization, and as a side-effect writes the visualization to an html file, named `category.lower() + '-' + other_category.lower() + extra + '.html'`.

In [10]:
def draw_corpus(df, corpus, category, other_category, category_col, extra='', scores=None, singleScoreMode=False, 
                minimum_term_frequency=2, grey_zero_scores=False, sort_by_dist=True):
    html = st.produce_scattertext_explorer(corpus, 
                                           category=category, 
                                           category_name=category.lower(), 
                                           not_category_name=other_category.lower(),
                                           pmi_filter_thresold=2,
                                           minimum_term_frequency=minimum_term_frequency,
                                           metadata=df['speaker and debate'],
                                           scores=scores,
                                           width_in_pixels=1000,
                                           grey_zero_scores=grey_zero_scores,
                                           singleScoreMode=singleScoreMode,
                                           sort_by_dist=sort_by_dist)
    file_name = category.lower() + '-' + other_category.lower() + extra + '.html'
    open(file_name, 'wb').write(html.encode('utf-8'))
    return IFrame(src=file_name, width = 1200, height=1000)

def draw_plot(df, category, other_category, category_col, extra=''):
    # Scattertext can only do a one column vs. all analysis.  We're excluding any other speakrs
    category_vs_other_df = df[(df[category_col] == category) | (df[category_col] == other_category)]
    corpus = st.CorpusFromPandas(category_vs_other_df, 
                                 category_col = category_col, 
                                 text_col = 'statement',
                                 nlp = nlp).build()
    return draw_corpus(category_vs_other_df,  corpus, category, other_category, category_col, extra=extra)

# Find the top words used by the candidates in the 3rd debate

In [11]:

category, other_category, category_col = 'CLINTON', 'TRUMP', 'speaker'
debate_3 = st.CorpusFromPandas(data_frame = debate_dfs['3rd'][( debate_dfs['3rd'][category_col] == category) 
                                                              | ( debate_dfs['3rd'][category_col] == other_category)], 
                               category_col = category_col, 
                               text_col = 'statement',
                               nlp = nlp).build()

term_df = debate_3.get_term_freq_df()
term_df['Trump'] = debate_3.get_scaled_f_scores('TRUMP')
term_df['Clinton'] = debate_3.get_scaled_f_scores('CLINTON')

print('Trump top terms')
print(term_df.sort_values(by='Trump', ascending=False).iloc[:20].index)
print('Clinton top terms')
print(term_df.sort_values(by='Clinton', ascending=False).iloc[:20].index)

Trump top terms
Index(['hillary', 'bad', 'she wants', 'you have', 'the border', 'and she',
       'justices', 'signed', 'percent', 'strong', 'outsmarted', 'a disaster',
       'she 's', 'deals', 'no idea', 'have no', 'start', 'appoint', 'pouring',
       'the baby'],
      dtype='object', name='term')
Clinton top terms
Index(['women', 'kind of', 'against', 'that is', 'work', 'stand',
       'undocumented', 'also', 'most', 'guns', 'stand up', 'the debt',
       'the kind', 'rights', 'against it', 'v.', 'million', 'families',
       'new jobs', 'should be'],
      dtype='object', name='term')


#  Clinton vs. Trump word use

In [12]:
draw_plot(df_all, 'CLINTON', 'TRUMP', 'speaker')

In [8]:
draw_plot(df_all, 'KAINE', 'PENCE', 'speaker')

In [9]:
draw_plot(df_all, 'Democratic', 'Republican', 'party')

# Visualize LDA topic model of the debates

## First, create a corpus of all the 2016 debates

In [17]:
df_dem_rep = df_all[df_all.party.isin({'Democratic', 'Republican'})]
corpus = st.CorpusFromPandas(df_dem_rep, 
                             category_col = 'party', 
                             text_col = 'statement',
                             nlp = nlp).build()

## Filter out bigrams and stopwords from the corpus, making a new one called `corpus_uni_stop`

In [18]:
corpus_uni_stop = corpus.get_stoplisted_unigram_corpus()

## Train two, party-specifc topic models and one general model

In [19]:
from sklearn.decomposition import LatentDirichletAllocation
lda_models = {}
for party in ['Republican', 'Democratic', 'General']:
    #subset the term-document matrix to only speech from one paraty or aanother
    if party != 'General':
        X = corpus_uni_stop._X[corpus_uni_stop._y == corpus_uni_stop.get_categories().index(party),:]
    else:
        X = corpus_uni_stop._X
    lda_models[party] = (LatentDirichletAllocation(n_topics=20, 
                                                   max_iter=60,
                                                   learning_method='online',
                                                   learning_offset=50.,
                                                   random_state=0)
                         .fit(X))

In [20]:
def top_words_in_topic(scores, corpus, n_top_words):
    return [corpus._term_idx_store.getval(i) for i 
            in scores.argsort()[:-n_top_words - 1:-1]]

def print_some_topics(model):
    for topic_idx, topic in list(enumerate(model.components_))[:3]:
        print("Topic #%d:" % topic_idx)
        print(', '.join(top_words_in_topic(model.components_[topic_idx], corpus_uni_stop, 10)))
        
    print()
print("Some General Topics")
print_some_topics(lda_models['General'])
print("Some Republican Topics")
print_some_topics(lda_models['Republican'])
print("Some Democratic Topics")
print_some_topics(lda_models['Democratic'])


Some General Topics
Topic #0:
concerned, office, troubling, installers, deeply, visited, far, threat, man, met
Topic #1:
chicago, puppet, shootings, 4,000, guns, 1st, january, terrible, 2014, shared
Topic #2:
trashing, muslims, syrians, pick, bookstore, tomorrow, announced, book, called, stronger

Some Republican Topics
Topic #0:
years, look, 30, entitled, miss, imagine, deductions, half, debt, number
Topic #1:
nonsense, oh, telling, puppet, cybersecurity, surge, speak, seventh, circuit, respectful
Topic #2:
slowest, recovery, great, economic, depression, $, audit, release, returns, audited

Some Democratic Topics
Topic #0:
security, going, social, trade, read, solvent, book, energy, enforce, voted
Topic #1:
apologize, crisis, collapse, fact, worst, shared, minutes, dramatically, improved, 2014
Topic #2:
think, donald, people, prepared, yes, undocumented, promised, privatize, said, jobs



In [14]:
topic_idx = 1
party='Democratic'
print('Top terms in Dem topic %s' % topic_idx, 
      top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='_dem_topic_%s'%(topic_idx), 
            scores = lda_models[party].components_[topic_idx],
            minimum_term_frequency=1,
            singleScoreMode=True)

Top terms in Dem topic 1 ['actually', 'admit', 'completely', 'polished', 'values', 'antithetical', 'jeffersonian', 'bit', 'borders', 'open']


In [15]:
topic_idx = 7
party='Republican'
print('Top terms in Rep topic %s' % topic_idx, 
      top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='_rep_topic_%s'%(topic_idx), 
            scores = lda_models[party].components_[topic_idx],
            minimum_term_frequency=1,
            singleScoreMode=True)

Top terms in Rep topic 7 ['doubt', 'jail', 'debunked', 'ugh', 'taunting', 'yeah', 'doing', 'just', 'defective', 'lester']


In [16]:
topic_idx = 2
party='General'
print('Top terms in general topic %s' % topic_idx, 
      top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='_gen_topic_%s'%(topic_idx), 
            scores = lda_models[party].components_[topic_idx],
            minimum_term_frequency=1,
            singleScoreMode=True)

Top terms in general topic 2 ['shared', 'bet', 'u.s.', 'program', 'suspended', 'citizen', 'announced', 'website', 'think', 'actually']


## Visualizing Word2Vec term similarity 

### Score each term in corpus against the word "job".  SpaCy includes 300-dimensional word vectors and a cosine-similarity function.

In [23]:
base_term_text = 'job'
base_term = nlp(base_term_text)
scores=np.array([base_term.similarity(nlp(tok)) 
                 for tok 
                 in corpus_uni_stop._term_idx_store._i2val])



print('Terms that are most similar to "%s"' % base_term)
print(top_words_in_topic(scores, corpus_uni_stop, 10))
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra = '_embedding_' + base_term_text, 
            scores = scores,
            minimum_term_frequency=1,
            singleScoreMode=True)

Terms that are most similar to "job"
['job', 'jobs', 'position', 'role', 'work', 'career', 'duty', 'responsibilities', 'tenure', 'contract']


In [3]:
base_term = nlp('wealth')
scores=np.array([base_term.similarity(nlp(tok)) 
                 for tok 
                 in corpus_uni_stop._term_idx_store._i2val])

print('Terms that are most similar to "%s"' % base_term)
print(top_words_in_topic(scores, corpus_uni_stop, 10))
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='_dem_topic_%s'%(topic_idx), 
            scores = scores,
            minimum_term_frequency=1,
            singleScoreMode=True)

NameError: name 'nlp' is not defined

# Let's use Scattertext to inspect the coefficients from Lasso-logistic regression
* We can see the accuracies used 

In [19]:
l1scores, acc, baseline = corpus_uni_stop.get_logistic_regression_coefs_l1('Democratic')
print('Terms have the highest lasso coefficients for predicting Democrats are')
print(top_words_in_topic(l1scores, corpus_uni_stop, 10))
print('Terms have the highest lasso coefficients for predicting Republicans are')
print(top_words_in_topic(-1*l1scores, corpus_uni_stop,  10))
print('Cross-validated classification accuracy', acc)
print('Baseline (class-conditional) accuracy', baseline)



Terms have the highest lasso coefficients for predicting Democrats are
['chief', 'kind', 'donald', 'mistake', 'intelligence', 'worked', 'debate', 'vladimir', 'good', 'book']
Terms have the highest lasso coefficients for predicting Republicans are
['tell', 'clinton', 'mean', 'tremendous', 'kaine', 'change', 'country', 'stop', 'happy', 'respond']
Cross-validated classification accuracy 0.627182044888
Baseline (class-conditional) accuracy 0.56608478803


In [20]:
#draw_corpus(df_dem_rep,  corpus_uni_stop, 'Democratic', 'Republican', 'party', extra='lasso')
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='lasso', 
            scores = l1scores,
            minimum_term_frequency=1,
            sort_by_dist = False,
            grey_zero_scores = True)

# Comare Clinton and Trump's 1st debate

In [21]:
draw_plot(debate_dfs['1st'], 'CLINTON', 'TRUMP', 'speaker', '1st')

# Compare Trump to Pence

In [22]:
draw_plot(df_all, 'TRUMP', 'PENCE', 'speaker', '_trumppence')

# Compare the 1st to the 2nd debate

In [23]:
draw_plot(df_all, '1st', '2nd', 'debate', '_1st_vs_2nd')