{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this document, we will use SVM to help to classify the category of 20 group news dataset. \n",
    "Though evaluation, it is obviouse that SVM is mode accurate than the naive bayes classifier on this dataset.\n",
    "\n",
    "The first some steps are same with the one in the naive bayes classifier: [Naive Bayes classification on 20newsgroups dataset]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']\n"
     ]
    }
   ],
   "source": [
    "from sklearn.datasets import fetch_20newsgroups\n",
    "categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']\n",
    "twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)\n",
    "print twenty_train.target_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "count_vect = CountVectorizer()\n",
    "X_train_counts = count_vect.fit_transform(twenty_train.data)\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "tfidf_transformer = TfidfTransformer()\n",
    "X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2 1]\n",
      "'Abuse of antibiotics is very common' => sci.med\n",
      "'OpenGL on the GPU is fast' => comp.graphics\n"
     ]
    }
   ],
   "source": [
    "from sklearn import svm\n",
    "clf = svm.SVC(kernel = 'linear')\n",
    "clf.fit(X_train_tfidf, twenty_train.target)\n",
    "\n",
    "\n",
    "# Next, we will write two sentences to test the model.\n",
    "docs_new = ['Abuse of antibiotics is very common', 'OpenGL on the GPU is fast']\n",
    "X_new_counts = count_vect.transform(docs_new)\n",
    "X_new_tfidf = tfidf_transformer.transform(X_new_counts)\n",
    "\n",
    "# the following code will show the category pridicted by the model\n",
    "predicted = clf.predict(X_new_tfidf)\n",
    "print predicted\n",
    "for doc, category in zip(docs_new, predicted):\n",
    "    print('%r => %s' % (doc, twenty_train.target_names[category]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                        precision    recall  f1-score   support\n",
      "\n",
      "           alt.atheism       0.96      0.83      0.89       319\n",
      "         comp.graphics       0.90      0.96      0.93       389\n",
      "               sci.med       0.94      0.91      0.93       396\n",
      "soc.religion.christian       0.89      0.96      0.93       398\n",
      "\n",
      "           avg / total       0.92      0.92      0.92      1502\n",
      "\n",
      "accurary\t0.920772303595\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "import numpy as np;\n",
    "\n",
    "# get the test data from test dataset\n",
    "twenty_test = fetch_20newsgroups(subset='test',categories=categories, shuffle=True, random_state=42)\n",
    "docs_test = twenty_test.data\n",
    "#vectorize test data\n",
    "X_test_counts = count_vect.transform(docs_test)\n",
    "#extract feature of test data\n",
    "X_test_tfidf = tfidf_transformer.transform(X_test_counts)\n",
    "# use the model to predict the category \n",
    "predicted = clf.predict(X_test_tfidf)\n",
    "\n",
    "#get the precision, recall, f1-score and support of this model\n",
    "print(metrics.classification_report(twenty_test.target, predicted,target_names=twenty_test.target_names))\n",
    "#get the accuracy of the model\n",
    "print(\"accurary\\t\"+str(np.mean(predicted == twenty_test.target)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can see the accuracy is 0.92, much higher than naive bayes classifier. (0.83)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}