{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Classifying News Headlines and Explaining the Result\n", "Reference: Classifying News Headlines and Explaining the Result from [Kaggle](http://nbviewer.jupyter.org/github/dreamgonfly/lime-examples/blob/master/Classifying%20News%20Headlines%20and%20Explaining%20the%20Result.ipynb)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "# using Kaggle API https://github.com/Kaggle/kaggle-api\n", "DATA_FILE = \"~/.kaggle/datasets/uciml/news-aggregator-dataset/uci-news-aggregator.csv\"\n", "news = pd.read_csv(DATA_FILE).sample(frac=0.1)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "42242" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(news)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDTITLEURLPUBLISHERCATEGORYSTORYHOSTNAMETIMESTAMP
1352913530Robotic fish designed to perform escape maneuv...http://www.ecnmag.com/news/2014/03/robotic-fis...ECNmag.comtdSmJK-WR4xv2inMKmnmxaRfd6cf1Mwww.ecnmag.com1395059947658
254251254697Faces & names: 'X-Men' climbs to $302 million ...http://www.duluthnewstribune.com/content/faces...Duluth News Tribuneed5poaO2w8Yffx6MDgPRQSF5POXCXMwww.duluthnewstribune.com1401174011596
2778527786Which 'Divergent' Starlet Skipped Underwear fo...http://www.cambio.com/2014/03/19/which-diverge...Cambioed55mX4D4wN3d5vMMYF9GgviF21QlMwww.cambio.com1395333837043
\n", "
" ], "text/plain": [ " ID TITLE \\\n", "13529 13530 Robotic fish designed to perform escape maneuv... \n", "254251 254697 Faces & names: 'X-Men' climbs to $302 million ... \n", "27785 27786 Which 'Divergent' Starlet Skipped Underwear fo... \n", "\n", " URL \\\n", "13529 http://www.ecnmag.com/news/2014/03/robotic-fis... \n", "254251 http://www.duluthnewstribune.com/content/faces... \n", "27785 http://www.cambio.com/2014/03/19/which-diverge... \n", "\n", " PUBLISHER CATEGORY STORY \\\n", "13529 ECNmag.com t dSmJK-WR4xv2inMKmnmxaRfd6cf1M \n", "254251 Duluth News Tribune e d5poaO2w8Yffx6MDgPRQSF5POXCXM \n", "27785 Cambio e d55mX4D4wN3d5vMMYF9GgviF21QlM \n", "\n", " HOSTNAME TIMESTAMP \n", "13529 www.ecnmag.com 1395059947658 \n", "254251 www.duluthnewstribune.com 1401174011596 \n", "27785 www.cambio.com 1395333837043 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "news.head(3)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "encoder = LabelEncoder()\n", "\n", "X = news['TITLE']\n", "y = encoder.fit_transform(news['CATEGORY'])\n", "\n", "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0., 0., 0., 1.],\n", " [0., 1., 0., 0.],\n", " [0., 1., 0., 0.],\n", " ...,\n", " [0., 1., 0., 0.],\n", " [0., 0., 1., 0.],\n", " [0., 0., 0., 1.]])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# You can also make it categorical ont-hot vector.\n", "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n", "OneHotEncoder().fit_transform(\n", " LabelEncoder().fit_transform(news['CATEGORY']).reshape(-1,1)\n", ").toarray()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "31681" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(X_train)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10561" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(X_test)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.core.series.Series" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(X_train)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "155319 Facebook: Mobile Powers Growth\n", "230004 Stocks rise ahead of Fed minutes; Dow jumps 10...\n", "91687 GM Expected to Announce Major Investment in 20...\n", "Name: TITLE, dtype: object" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.head(3)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "vectorizer = CountVectorizer(min_df=3)\n", "\n", "train_vectors = vectorizer.fit_transform(X_train)\n", "test_vectors = vectorizer.transform(X_test)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<31681x9886 sparse matrix of type ''\n", "\twith 267205 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_vectors" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Stocks rise ahead of Fed minutes; Dow jumps 100 points'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.iloc[1]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<1x9886 sparse matrix of type ''\n", "\twith 10 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_vectors[1]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "scipy.sparse.csr.csr_matrix" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(train_vectors)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, ..., 0, 0, 0]])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# one-hot vector\n", "train_vectors[1].toarray()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Decision Tree" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", "%load_ext autotime" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", " max_features=None, max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n", " splitter='best')" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "time: 5.64 s\n" ] } ], "source": [ "from sklearn import tree\n", "dt = tree.DecisionTreeClassifier()\n", "dt.fit(train_vectors, y_train)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8092983618975476" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "time: 13.9 ms\n" ] } ], "source": [ "pred = dt.predict(test_vectors)\n", "accuracy_score(y_test, pred, )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Random Forest" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,\n", " oob_score=False, random_state=None, verbose=0,\n", " warm_start=False)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "time: 8.03 s\n" ] } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "rf = RandomForestClassifier(n_estimators=20)\n", "rf.fit(train_vectors, y_train)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8496354511883344" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "time: 247 ms\n" ] } ], "source": [ "pred = rf.predict(test_vectors)\n", "accuracy_score(y_test, pred, )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Multinomial Naive Bayes" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "time: 20.6 ms\n" ] } ], "source": [ "from sklearn.naive_bayes import MultinomialNB\n", "nb = MultinomialNB()\n", "nb.fit(train_vectors, y_train)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9051226209639238" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "time: 7.33 ms\n" ] } ], "source": [ "pred = nb.predict(test_vectors)\n", "accuracy_score(y_test, pred, )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Explaining the result" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%unload_ext autotime" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.pipeline import make_pipeline\n", "c = make_pipeline(vectorizer, nb)\n", "\n", "from lime.lime_text import LimeTextExplainer\n", "explainer = LimeTextExplainer(class_names=list(encoder.classes_))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Henry Cavill Is Still Super Handsome (But Way More Serious) in the First Official ...'" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# .sample is random select\n", "example = X_test.sample(1).iloc[0]\n", "example" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[8.54676048e-04, 9.00036923e-01, 4.36873332e-02, 5.54210682e-02]])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c.predict_proba([example])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%capture\n", "exp = explainer.explain_instance(example, c.predict_proba, top_labels=2)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", "
\n", " \n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "exp.show_in_notebook()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 1 }