In [1]:
from sklearn.datasets import fetch_20newsgroups
# The four newsgroups this notebook trains on and explains.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


def fetch_subset(subset):
    """Fetch one split of 20 newsgroups restricted to ``categories``.

    ``subset`` is forwarded unchanged to ``fetch_20newsgroups`` (this
    notebook calls it with 'train' and 'test'). Headers, footers and quoted
    replies are removed, and the documents are shuffled with a fixed seed.
    """
    return fetch_20newsgroups(
        subset=subset,
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=('headers', 'footers', 'quotes'),
    )


train, test = fetch_subset('train'), fetch_subset('test')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep separate handles on the vectorizer and the classifier: they are fitted
# through the pipeline below and then handed to eli5 directly in later cells.
vec = TfidfVectorizer()
clf = LogisticRegressionCV()
pipeline = Pipeline(steps=[
    ('vec', vec),
    ('clf', clf),
])
pipeline.fit(train['data'], train['target'])

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

In [12]:
import eli5
from eli5 import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles, fields

# print(format_as_text(explain_weights(clf, vec, target_names=train['target_names'])))

In [4]:
# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML


def show_html(html):
    """Render a raw HTML string as rich output in the notebook."""
    display(HTML(html))


def show_html_expl(expl, **kwargs):
    """Format an explanation as HTML and display it.

    The explanation is formatted with ``include_styles=False``; any extra
    keyword arguments are forwarded to ``format_as_html`` unchanged.
    """
    show_html(format_as_html(expl, include_styles=False, **kwargs))


# Emit the shared styles once here, since show_html_expl formats every
# explanation with include_styles=False.
show_html(format_html_styles())

In [5]:
# Show the learned feature weights, one table per target class.
eli5.show_weights(
    clf,
    vec=vec,
    target_names=train['target_names'],
    horizontal_layout=False,
)

Weight?,Feature
+18.117,atheism
+16.558,atheists
+14.393,religion
+14.380,bobby
+14.325,matthew
+13.389,motto
+13.215,atheist
+13.010,islam
+12.800,nanci
+12.216,enviroleague

Weight?,Feature
+25.897,graphics
+18.957,image
+17.298,computer
+16.843,3d
+16.190,file
+14.020,points
+13.269,sgi
+13.180,42
+12.428,hi
+11.835,3do

Weight?,Feature
+35.983,space
+17.907,orbit
+15.269,nasa
+15.173,launch
+13.235,spacecraft
+12.872,mars
+12.369,nick
+12.117,moon
+12.064,allen
+11.800,shuttle

Weight?,Feature
+19.215,christian
+16.667,blood
+14.907,fbi
+14.185,christians
+12.783,hudson
+12.746,order
+12.338,christ
+12.126,ekr
+11.972,terrorist
+11.608,koresh


In [6]:
# Explain the prediction for the third test document; weights are not forced
# and the horizontal layout is requested.
expl = explain_prediction(
    clf, test['data'][2], vec,
    target_names=train['target_names'],
)
show_html_expl(expl, force_weights=False, horizontal_layout=True)

Contribution?,Feature
-1.394,<BIAS>
-14.777,Highlighted in text (sum)

Contribution?,Feature
9.631,Highlighted in text (sum)
-1.015,<BIAS>

Contribution?,Feature
-1.016,<BIAS>
-5.808,Highlighted in text (sum)

Contribution?,Feature
-1.019,<BIAS>
-10.865,Highlighted in text (sum)


``horizontal_layout=True`` is supported for prediction explanations too; in this layout the highlighted text is shown only for the top prediction.

In [7]:
# Same document as above, this time forcing the per-feature contribution
# tables to be shown.
expl = explain_prediction(
    clf, test['data'][2], vec,
    target_names=train['target_names'],
)
show_html_expl(expl, force_weights=True)

Contribution?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Contribution?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Contribution?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Contribution?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+0.889,some,,
+0.539,much,,
+0.278,is,,
+0.266,which,,
+0.225,designer,,
+0.218,it,,
+0.161,most,,
+0.107,trying,,
-0.005,interior,,
-0.009,has,,

Contribution?,Feature
0.889,some
0.539,much
0.278,is
0.266,which
0.225,designer
0.218,it
0.161,most
0.107,trying
-0.005,interior
-0.009,has

Contribution?,Feature
3.12,graphics
2.661,software
1.707,hi
1.18,looking
1.127,buy
0.906,features
0.85,pc
0.672,help
0.53,any
0.52,it

Contribution?,Feature
0.87,costs
0.637,buy
0.606,software
0.5,most
0.393,the
0.287,on
0.281,some
0.259,better
0.249,likes
0.241,sophisticated

Contribution?,Feature
2.181,he
0.528,my
0.481,more
0.345,and
0.313,friend
0.287,suggestion
0.228,trying
0.137,find
0.11,here
0.104,from

Contribution?,Feature
-1.394,<BIAS>
-14.777,Highlighted in text (sum)

Contribution?,Feature
9.631,Highlighted in text (sum)
-1.015,<BIAS>

Contribution?,Feature
-1.016,<BIAS>
-5.808,Highlighted in text (sum)

Contribution?,Feature
-1.019,<BIAS>
-10.865,Highlighted in text (sum)


We can hide weights by passing ``force_weights=False`` (they will still be shown if it's impossible to highlight the text).

In [8]:
# Explain the fifth test document without forcing the weight tables.
expl = explain_prediction(
    clf, test['data'][4], vec,
    target_names=train['target_names'],
)
show_html_expl(expl, force_weights=False)

Contribution?,Feature
-1.394,<BIAS>
-6.122,Highlighted in text (sum)

Contribution?,Feature
7.447,Highlighted in text (sum)
-1.015,<BIAS>

Contribution?,Feature
-1.016,<BIAS>
-9.098,Highlighted in text (sum)

Contribution?,Feature
-1.019,<BIAS>
-10.662,Highlighted in text (sum)


Show explanations for the winning class for the first 10 documents from the test data.

In [9]:
import numpy as np
# One explanation per document, restricted to a single target per document.
for doc in test['data'][:10]:
    expl = explain_prediction(
        clf, doc, vec,
        target_names=train['target_names'],
        top_targets=1,
    )
    show_html_expl(expl, force_weights=False)

Contribution?,Feature
6.073,Highlighted in text (sum)
-1.016,<BIAS>


Contribution?,Feature
7.208,Highlighted in text (sum)
-1.015,<BIAS>


Contribution?,Feature
9.631,Highlighted in text (sum)
-1.015,<BIAS>


Contribution?,Feature
4.294,Highlighted in text (sum)
-1.015,<BIAS>


Contribution?,Feature
7.447,Highlighted in text (sum)
-1.015,<BIAS>


Contribution?,Feature
1.509,Highlighted in text (sum)
-1.015,<BIAS>


Contribution?,Feature
0.019,Highlighted in text (sum)
-1.015,<BIAS>


Contribution?,Feature
11.409,Highlighted in text (sum)
-1.016,<BIAS>


Contribution?,Feature
10.319,Highlighted in text (sum)
-1.394,<BIAS>


Contribution?,Feature
0.436,Highlighted in text (sum)
-1.016,<BIAS>


Now use a vectorizer that skips stopwords

In [10]:
# Second model: same pipeline shape as before, but the vectorizer is built
# with stop_words='english'.
vec_stop = TfidfVectorizer(stop_words='english')
clf_stop = LogisticRegressionCV()
pipeline_stop = Pipeline(steps=[
    ('vec', vec_stop),
    ('clf', clf_stop),
])
pipeline_stop.fit(train['data'], train['target'])

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

Words such as "the", "in", "of" are not used as features and are not highlighted

In [11]:
# Explain the fifth test document again, now with the stop-word-filtering
# vectorizer and the classifier trained on its features.
show_html_expl(
    explain_prediction(
        clf_stop, test['data'][4], vec_stop,
        target_names=train['target_names'],
    ),
    force_weights=False,
)

Contribution?,Feature
-1.395,<BIAS>
-6.399,Highlighted in text (sum)

Contribution?,Feature
7.011,Highlighted in text (sum)
-1.018,<BIAS>

Contribution?,Feature
-1.017,<BIAS>
-6.675,Highlighted in text (sum)

Contribution?,Feature
-1.07,<BIAS>
-9.294,Highlighted in text (sum)
