{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.20.3\n", "0.19.1\n", "1.14.2\n" ] } ], "source": [ "# Imports\n", "\n", "import os\n", "import lime\n", "import sklearn\n", "import sklearn.ensemble\n", "# imported explicitly: later cells reach these submodules through the `sklearn.` prefix\n", "import sklearn.feature_extraction.text\n", "import sklearn.metrics\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.datasets import fetch_20newsgroups\n", "\n", "# print() with a single argument behaves the same on Python 2 and Python 3\n", "print(pd.__version__)\n", "print(sklearn.__version__)\n", "print(np.__version__)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n" ] } ], "source": [ "# list every class available in the 20 newsgroups dataset\n", "newsgroups_train = fetch_20newsgroups(subset='train')\n", "print(newsgroups_train.target_names)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we will be exploring \"comp.sys.ibm.pc.hardware\", \"comp.sys.mac.hardware\"\n", "categories = [\"comp.sys.ibm.pc.hardware\", \"comp.sys.mac.hardware\"]\n", "newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)\n", "newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)\n", "\n", "newsgroups_train.keys()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "From: blakey@ug.cs.dal.ca (Jason \"Fish\" Blakey)\n", "Subject: Newlife 25 and hard drives\n", "Nntp-Posting-Host: ug.cs.dal.ca\n", "Organization: Math, Stats & CS, Dalhousie University, Halifax, NS, Canada\n", "Lines: 12\n", "\n", " Giday netters! Just got a used Newlife 25 accelerator, with FPU, and i \n", "was wondering about a few points. \n", "-Anyone know the current driver version for it??\n", "-Can it handle the 16-bit grayscale card, if i get the video option\n", "-Why would it be hating my hard drive?(can't use the accelerator and \n", "\thard drive at the same time). Do i need a new driver on my drive?\n", "\tWhat make?\n", "-Thanks,\n", "\tJason\n", "-- \n", " ............................................................................ \n", " blakey@ug.cs.dal.ca -> He's big! He's purple! He's your best friend!\n", "\n", "++++++++++++++++++\n", "comp.sys.ibm.pc.hardware\n" ] } ], "source": [ "# Sample: first test document followed by its label\n", "print(newsgroups_test.data[0])\n", "print(\"++++++++++++++++++\")\n", "# look up the sample's own label through its target index;\n", "# target_names[0] alone is always the first class name, whatever the sample is\n", "print(newsgroups_test.target_names[newsgroups_test.target[0]])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# vectorize the text with TF-IDF, keeping the original casing (lowercase=False)\n", "# (TF-IDF itself is discussed later in this challenge)\n", "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)\n", "# fit the vocabulary on the training documents only, then reuse it for the test documents\n", "train_vectors = vectorizer.fit_transform(newsgroups_train.data)\n", "test_vectors = vectorizer.transform(newsgroups_test.data)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "((1168, 21486), (777, 21486))" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_vectors.shape, test_vectors.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Random Forest" ] }, { "cell_type": "code", "execution_count": 
7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,\n", " oob_score=False, random_state=None, verbose=0,\n", " warm_start=False)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# model 1: random forest with 500 trees, trained on the TF-IDF vectors\n", "rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)\n", "rf.fit(X=train_vectors, y=newsgroups_train.target)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.8854568854568855" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# accuracy of the random forest on the test vectors\n", "pred = rf.predict(test_vectors)\n", "sklearn.metrics.accuracy_score(y_true=newsgroups_test.target, y_pred=pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Naive Bayes" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# model 2: multinomial naive Bayes with default settings\n", "from sklearn.naive_bayes import MultinomialNB\n", "nb = MultinomialNB()\n", "nb.fit(X=train_vectors, y=newsgroups_train.target)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.9124839124839125" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# accuracy of naive Bayes on the test vectors\n", "pred = nb.predict(test_vectors)\n", "sklearn.metrics.accuracy_score(y_true=newsgroups_test.target, y_pred=pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Logistic Classifier" 
] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# model 3: logistic regression with default settings\n", "from sklearn.linear_model import LogisticRegression\n", "lr = LogisticRegression()\n", "lr.fit(train_vectors, newsgroups_train.target)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.8867438867438867" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# accuracy of logistic regression on the test vectors\n", "pred = lr.predict(test_vectors)\n", "sklearn.metrics.accuracy_score(newsgroups_test.target, pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lime in Action" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from lime import lime_text\n", "from sklearn.pipeline import make_pipeline\n", "\n", "# chain the fitted vectorizer with each fitted model so the pipelines accept raw text\n", "crf = make_pipeline(vectorizer, rf)\n", "cnb = make_pipeline(vectorizer, nb)\n", "clr = make_pipeline(vectorizer, lr)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from lime.lime_text import LimeTextExplainer\n", "explainer = LimeTextExplainer(class_names=['ibm', 'mac'])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# pick one random example from the test dataset and look at the top 6 features that each\n", "# classifier relies on when predicting the class of that example\n", "# randint(n) draws from 0..n-1, so every test document (including index 0) can be picked\n", "idx = np.random.randint(len(newsgroups_test.data))\n", "exp_crf = 
explainer.explain_instance(newsgroups_test.data[idx], crf.predict_proba, num_features=6)\n", "exp_clr = explainer.explain_instance(newsgroups_test.data[idx], clr.predict_proba, num_features=6)\n", "exp_cnb = explainer.explain_instance(newsgroups_test.data[idx], cnb.predict_proba, num_features=6)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", " \n", "
\n", " \n", " \n", " \n", " " ], "text/plain": [ "