In [1]:
import requests
import pandas as pd
import numpy as np
import scattertext as st
import spacy
import time
from IPython.display import IFrame
from IPython.core.display import display, HTML
from bokeh.palettes import PuBu
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, ranges, LabelSet
from bokeh.plotting import figure
output_notebook()
display(HTML("<style>.container { width:98% !important; }</style>"))
%matplotlib inline

In [2]:
assert [int(x) for x in st.__version__.split('.')] >=[0,0,2,20]

In [2]:
url = 'https://openreview.net/notes?invitation=ICLR.cc%2F2018%2FConference%2F-%2FBlind_Submission&offset=0&limit=1000'
df = pd.DataFrame(requests.get(url).json()['notes'])

In [1]:
forum_content = []
for i, forum_id in list(enumerate(df.forum)):
    notes_url = 'https://openreview.net/notes?forum={}&trash=true'.format(forum_id)
    try:
        forum_content.append(requests.get(notes_url).json())
    except:
        print('err', i, forum_id)
        forum_content = {}
    time.sleep(.3)
df['forumContent'] = pd.Series(forum_content)   

df.to_csv('iclr2018_raw.csv.bz2', index=False, compression='bz2')

NameError: name 'df' is not defined

In [4]:
#
# Code here is to read locally:
read_local = True
if read_local:
    df = pd.read_csv('iclr2018_raw.csv.bz2')
    df['forumContent'] = df.forumContent.apply(eval) # totally unsafe
    df['content'] = df.content.apply(eval) 

In [5]:
df['decision_raw'] = df.forumContent.apply(lambda x:[n['content']['decision'] 
                                                     for n in x['notes'] 
                                                     if 'decision' in n['content']][0])
df['decision_raw'].value_counts()

Reject                      504
Accept (Poster)             313
Invite to Workshop Track     90
Accept (Oral)                23
Name: decision_raw, dtype: int64

In [6]:
len(df)

930

In [7]:
df['title'] = df.content.apply(lambda x: x['title'])
df['authors'] = df.content.apply(lambda x: x['authors'])

only_reviews_df = pd.concat(df.forumContent.apply(lambda c: pd.DataFrame([
    {'review': n['content']['review'], 
     'rating': n['content']['rating'],  
     'confidence': n['content']['confidence'],
     'forum': n['forum']} 
    for n in c['notes'] 
    if 'content' in n and 'review' in n['content']
])).tolist())
reviews_df = pd.merge(df[['title', 'authors', 'decision_raw', 'forum']], only_reviews_df, on='forum')
#reviews_df.groupby('decision_raw')['rating'].value_counts()
reviews_df['decision'] = (reviews_df['decision_raw']
                          .apply(lambda x: 'Reject' if x == 'Reject' 
                                 else ('Accept' if x.startswith('Accept') 
                                       else 'Workshop')))
reviews_df['rating_bin'] = (reviews_df['rating']
                            .apply(lambda x: (lambda s: 'Negative' if s < 5 
                                              else ('Positive' if s > 6 else 'Neutral'))
                                   (int(x.split(':')[0].strip()))))
reviews_df['category'] = reviews_df['decision'] + ', ' + reviews_df['rating_bin']

In [7]:
decisions = reviews_df[['forum','decision_raw']].drop_duplicates()['decision_raw'].value_counts()
source = ColumnDataSource(dict(x=list(decisions.index),y=decisions.values))

#source = ColumnDataSource({'x': decisions.index, 'y': decisions.values}

plot = figure(plot_width=600, plot_height=300, tools="save",
        x_axis_label = "Decision",
        y_axis_label = "Paper Count",
        title="",
        x_minor_ticks=2,
        x_range = source.data["x"],
        y_range= ranges.Range1d(start=0,end=600))


labels = LabelSet(x='x', y='y', text='y', level='glyph',
        x_offset=-13.5, y_offset=0, source=source, render_mode='canvas')

plot.vbar(source=source,x='x',top='y',bottom=0,width=0.3,color=PuBu[7][2])

plot.add_layout(labels)
show(plot)

In [8]:
ratings = reviews_df['rating'].value_counts()
ratings.index = [int(c.split(':')[0]) for c in ratings.index]
ratings = ratings.sort_index()
source = ColumnDataSource(dict(x=[str(x) for x in ratings.index],y=ratings.values))

plot = figure(plot_width=600, plot_height=300, tools="save",
        x_axis_label = "Rating",
        y_axis_label = "Review Count",
        title="",
        x_minor_ticks=2,
        x_range = source.data["x"],
        y_range= ranges.Range1d(start=0,end=ratings.max() + 100))


labels = LabelSet(x='x', y='y', text='y', level='glyph',
        x_offset=-13.5, y_offset=0, source=source, render_mode='canvas')

plot.vbar(source=source,x='x',top='y',bottom=0,width=0.3,color=PuBu[7][2])

plot.add_layout(labels)
show(plot)

  elif np.issubdtype(type(obj), np.float):


In [10]:

reviews_df['metadata'] = (
    reviews_df['title'] + '<br/>Score: ' + reviews_df['rating'].apply(lambda x: x.split(':')[0]) + '/10'
    + '<br/>Confidence: ' + reviews_df['confidence'].apply(lambda x: x.split(':')[0]) + '/5'
    + '<br/>Ultimate decision: ' + reviews_df['decision'].apply(lambda x: x.split(':')[0]) + '/10'
)


In [8]:
reviews_df.to_csv('iclr2018_reviews.csv.bz2', index=False, compression='bz2')

## Start here for NLP

In [2]:
reviews_df = pd.read_csv('https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')

In [4]:
nlp = spacy.load('en')
reviews_df['parse'] = reviews_df['review'].apply(nlp)

In [50]:
corpus = (st.CorpusFromParsedDocuments(reviews_df, category_col = 'rating_bin', parsed_col = 'parse')
          .build()
          .remove_categories(['Neutral']))

In [51]:
html = st.produce_scattertext_explorer(corpus, 
                                       category='Positive', 
                                       not_categories=['Negative'],
                                       transform = st.Scalers.percentile_dense,
                                       term_scorer = st.RankDifference(),
                                       metadata = corpus.get_df()['metadata'])
file_name = '../jasonkessler.github.io/iclr2018reviews/pos_neg_dense.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)

6131293

In [7]:
four_square_corpus = (st.CorpusFromParsedDocuments(reviews_df, category_col = 'category', parsed_col = 'parse')
                      .build()
                      .get_unigram_corpus()
                      .compact(st.ClassPercentageCompactor(term_count=1)))

In [30]:
file_name = '../jasonkessler.github.io/iclr2018reviews/accept_reject_four_square_axes.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)

8267592

In [34]:
four_square_axes = st.FourSquareAxes(four_square_corpus, 
                                     left_categories=['Accept, Positive'], 
                                     right_categories=['Accept, Negative'], 
                                     top_categories=['Reject, Positive'], 
                                     bottom_categories=['Reject, Negative'], 
                                     labels = {'a': 'Positive',
                                               'b': 'Review that was Contrary to Accpetance Decision',
                                               'not_a': 'Negative',
                                               'not_b': 'Review that in Line With Acceptance Decision'},
                                     term_ranker=st.OncePerDocFrequencyRanker)
html = st.produce_four_square_axes_explorer(
    four_square_axes=four_square_axes,
    x_label="Accepts: Pos-Neg",
    y_label='Rejects: Neg-Pos',
    use_full_doc=True,
    metadata=four_square_corpus.get_df()['metadata'],
    color_func='(function(d) {return d3.rgb(230, 220, 230)})',
    censor_points = False,
)
file_name = '../jasonkessler.github.io/iclr2018reviews/accept_reject_four_square_axes_display.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)

8205863

In [36]:
four_square_axes = st.FourSquareAxes(four_square_corpus, 
                                     left_categories=['Accept, Positive'], 
                                     right_categories=['Accept, Negative'], 
                                     top_categories=['Reject, Positive'], 
                                     bottom_categories=['Reject, Negative'], 
                                     labels = {'a': 'Positive',
                                               'b': 'Review that was Contrary to Accpetance Decision',
                                               'not_a': 'Negative',
                                               'not_b': 'Review that in Line With Acceptance Decision'},
                                     term_ranker=st.OncePerDocFrequencyRanker)
html = st.produce_four_square_axes_explorer(
    four_square_axes=four_square_axes,
    x_label="Accepts: Pos-Neg",
    y_label='Rejects: Neg-Pos',
    use_full_doc=True,
    metadata=four_square_corpus.get_df()['metadata'],
    color_func='(function(d) {return d3.rgb(230, 220, 230)})',
)
file_name = '../jasonkessler.github.io/iclr2018reviews/accept_reject_four_square_axes_interactive.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)

8205862

In [35]:
four_square= st.FourSquare(four_square_corpus, 
                             category_a_list=['Accept, Positive'], 
                             category_b_list=['Accept, Negative'], 
                             not_category_b_list=['Reject, Positive'], 
                             not_category_a_list=['Reject, Negative'], 
                             labels = {'a_and_b': 'Accept',
                                       'not_a_and_not_b': 'Reject',
                                       'a_and_not_b': 'Positive',
                                       'b_and_not_a': 'Negative'},
                             term_ranker=st.OncePerDocFrequencyRanker)
html = st.produce_four_square_explorer(
    four_square=four_square,
    y_label='Accept-Reject',
    x_label='Positive-Negative',
    use_full_doc=True,
    metadata = four_square_corpus.get_df()['metadata'],
)
file_name = '../jasonkessler.github.io/iclr2018reviews/accept_reject_four_square.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)

8206300

In [11]:
#corpus = corpus.remove_infrequent_words(5)
t0 = time.time()
compact_corpus = st.CompactTerms(corpus, st.OncePerDocFrequencyRanker, 5).compact()
print(time.time() - t0)

29.19629192352295


In [20]:
fine_grain_corpus = (st.CorpusFromParsedDocuments(reviews_df, category_col='category', parsed_col='parse').build())

In [22]:
fine_grain_corpus.get_categories()

['Reject, Negative',
 'Reject, Neutral',
 'Accept, Negative',
 'Accept, Positive',
 'Reject, Positive',
 'Workshop, Neutral',
 'Accept, Neutral',
 'Workshop, Negative',
 'Workshop, Positive']

In [43]:
fine_grain_corpus_compact = st.CompactTerms(fine_grain_corpus, st.OncePerDocFrequencyRanker, 5).compact()

In [45]:
len(fine_grain_corpus_compact.get_terms()), len(fine_grain_corpus.get_terms())

(31640, 307829)

In [46]:
tdf = st.OncePerDocFrequencyRanker(fine_grain_corpus).get_ranks()
ap_vs_rp = st.RankDifference().get_scores(tdf['Accept, Positive freq'], tdf['Reject, Positive freq'])
print(terms.iloc[:10].index)
print(terms.iloc[-10:].index)

Index(['case for', 'evaluating', 'closer', 'closer to', 'machines',
       'applications', 'e.g. the', 'node', 'doing', 'are of'],
      dtype='object', name='term')
Index(['between', 'way', 'only', 'first', '/', 'method', 'given', 'about',
       'to see', 'see'],
      dtype='object', name='term')


In [51]:
an_vs_rn = st.RankDifference().get_scores(tdf['Reject, Positive freq'], tdf['Accept, Positive freq'])
print(terms.iloc[:10].index)
print(terms.iloc[-10:].index)

Index(['here the', 'observations', 'authors show', 'valuable', 'find that',
       'it ’s', 'from table', 'method which', 'put', 'the process'],
      dtype='object', name='term')
Index(['model', 'no', 'for the', 'new', 'neural', 'are not', 'dataset',
       'these', 'about', 'network'],
      dtype='object', name='term')


In [73]:
four_square = st.FourSquare(fine_grain_corpus_compact, 
                            ['Accept, Positive'], 
                            ['Reject, Positive'],
                            ['Accept, Negative'], 
                            ['Reject, Negative'], 
              term_ranker=st.OncePerDocFrequencyRanker,
              scorer = st.RankDifference())

In [74]:
html = st.produce_four_square_explorer(four_square=four_square,
                                       x_label='Pos-Neg',
                                       y_label='Accept-Reject',
                                       num_terms_semiotic_square=10,
                                       minimum_term_frequency=10,
                                       pmi_threshold_coefficient=10,
                                       term_ranker=st.OncePerDocFrequencyRanker,
                                       metadata=(fine_grain_corpus_compact._df['category'] + ': '
                                                 + fine_grain_corpus_compact._df.rating + ', '
                                                 + fine_grain_corpus_compact._df['title']))


In [75]:
file_name = 'four_square.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)

In [59]:
axes = four_square.get_axes()

In [61]:
axes.sort_values(by='x')

Unnamed: 0_level_0,x,y,counts
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
not well,-0.060523,-0.060523,23
observations,-0.056245,-0.056245,52
case for,-0.054141,-0.054141,14
it ’s,-0.053300,-0.053300,40
doing,-0.046216,-0.046216,53
networks as,-0.043832,-0.043832,15
here the,-0.043551,-0.043551,19
be the,-0.042359,-0.042359,62
from table,-0.040466,-0.040466,11
natural language,-0.040466,-0.040466,13


In [142]:
tdf = corpus.get_term_freq_df()
tdf['sfs'] = ScaledFScorePresets(beta = 1).get_scores(tdf['Reject freq'], tdf['Accept freq'])
tdf['sfs_p'] = ScaledFScorePresets(beta = 1, priors=priors).get_scores(tdf['Reject freq'], tdf['Accept freq'])
tdf['loridp'] = st.LogOddsRatioInformativeDirichletPrior(priors, reviews_df.parse.apply(len).mean(), 'word').get_scores(tdf['Reject freq'], tdf['Accept freq'])
tdf['rankdiff'] = st.RankDifference().get_scores(tdf['Reject freq'], tdf['Accept freq'])
pd.DataFrame(
    {s:tdf.sort_values(by=s, ascending=False).iloc[::].index
     for s in ['sfs', 'sfs_p', 'loridp', 'rankdiff']
    }
).iloc[:10]

Unnamed: 0,loridp,rankdiff,sfs,sfs_p
0,_,_ _,_ _,_ _
1,$ $,_,_,_
2,dialog,novelty,time series,time series
3,medical,i do,autoencoder,autoencoder
4,word2vec,layers,series,reconstruction
5,mutual,graph,connections,series
6,mutual information,limited,reconstruction,$ $
7,_ _,claim,novelty,connections
8,mi,class,$ $,classes
9,auto encoders,is no,classes,novelty


In [None]:
reviews_df = pd.read_csv('https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')
reviews_df['parse'] = reviews_df['review'].apply(spacy.load('en', parser=False))

# Create Corpus based on accept/reject/workshop decision
# A two-category corpus to use for plotting, with unigrams which only occur in bigrams removed.
# Terms used in <5 documents are removed as well.
full_corpus = (
    st.CorpusFromParsedDocuments(reviews_df, category_col='decision', parsed_col='parse')
    .build().remove_categories(['Workshop'])    
    .compact(st.CompactTerms(st.TermCompactor, minimum_term_count=6))    
)


# Use counts of unigrams and bigrams from the Workshop corpus as the Dirichlet prior
priors = (st.PriorFactory(full_corpus, term_ranker=st.OncePerDocFrequencyRanker)
          .use_categories(['Workshop'].align_to_target(corpus).get_priors()))
term_scorer = LogOddsRatioInformativeDirichletPrior(
          priors, reviews_df.parse.apply(len).mean(), 'word') # use the original approach to scaling prior
 
html = st.produce_frequency_explorer(corpus, 
  category='Accept', not_categories=['Reject'],
  term_ranker = st.OncePerDocFrequencyRanker,
  term_scorer = term_scorer,
  grey_threshold = 1.96,
  metadata = corpus.get_df()['metadata'])

In [56]:
file_name = 'accept_reject_loridp.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)

6131293

In [146]:
html = st.produce_frequency_explorer(compact_corpus, 
                                     category='Accept', 
                                     not_categories=['Reject'],
                                     term_ranker = st.OncePerDocFrequencyRanker,
                                     term_scorer = st.RankDifference(),
                                     grey_threshold = 0,                                     
                                     metadata = (corpus._df['title'] 
                                                 + '<br/>Score: ' + corpus._df['rating'].apply(lambda x: x.split(':')[0]) + '/10'
                                                 + '<br/>Confidence: ' + corpus._df['confidence'].apply(lambda x: x.split(':')[0]) + '/5'))
file_name = 'accept_reject_rankdiff.html'
open(file_name, 'wb').write(html.encode('utf-8'))

8424305

In [45]:
four_square_corpus_phrases = (st.CorpusFromParsedDocuments(reviews_df, category_col = 'category', parsed_col = 'parse',
                                                          feats_from_spacy_doc=st.PhraseMachinePhrases())
                              .build().compact(st.ClassPercentageCompactor(term_count=1)))

In [48]:
four_square_axes = st.FourSquareAxes(four_square_corpus_phrases, 
                                     left_categories=['Accept, Positive'], 
                                     right_categories=['Accept, Negative'], 
                                     top_categories=['Reject, Positive'], 
                                     bottom_categories=['Reject, Negative'], 
                                     labels = {'a': 'Positive',
                                               'b': 'Review that was Contrary to Accpetance Decision',
                                               'not_a': 'Negative',
                                               'not_b': 'Review that in Line With Acceptance Decision'},
                                     term_ranker=st.OncePerDocFrequencyRanker)
html = st.produce_four_square_axes_explorer(
    four_square_axes=four_square_axes,
    x_label="Accepts: Pos-Neg",
    y_label='Rejects: Neg-Pos',
    use_full_doc=True,
    pmi_threshold_coefficient=0,
    censor_points=False,
    metadata=four_square_corpus_phrases.get_df()['metadata'],
    color_func='(function(d) {return d3.rgb(230, 220, 230)})',
)
file_name = '../jasonkessler.github.io/iclr2018reviews/accept_reject_four_square_axes_phrases.html'
open(file_name, 'wb').write(html.encode('utf-8'))
#IFrame(src=file_name, width = 1500, height=700)

7409359

$$ \log_2 \frac{P(\mbox{word1" "word2})}{P(\mbox{word1}) \times P(\mbox{word2})} > 2 * \mbox{pmi_threshold_coefficient}$$