{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Importing the libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import math" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\dan\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] C:\\Users\\dan\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] C:\\Users\\dan\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", "[nltk_data] date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\dan\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from nltk import pos_tag\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from nltk.corpus import wordnet as wn\n", "import nltk\n", "import tqdm\n", "nltk.download('punkt')\n", "nltk.download('wordnet')\n", "nltk.download('averaged_perceptron_tagger')\n", "nltk.download('stopwords')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn import model_selection, naive_bayes, svm\n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from collections import defaultdict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set the random seed in order to guarantee reproducibility across runs (consistent results)." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Set the random seed\n", "np.random.seed(7)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Loading the corpus\n", "10'000 reviews. \\_\\_label\\_\\_1 are negative reviews, \\_\\_label\\_\\_2 are positive reviews." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "Corpus = pd.read_csv(\"corpus.csv\",encoding='latin-1')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "(10000, 2)\n", " text label\n", "0 Stuning even for the non-gamer: This sound tr... __label__2 \n", "1 The best soundtrack ever to anything.: I'm re... __label__2 \n", "2 Amazing!: This soundtrack is my favorite musi... __label__2 \n", "3 Excellent Soundtrack: I truly like this sound... __label__2 \n", "4 Remember, Pull Your Jaw Off The Floor After H... __label__2 \n", "5 an absolute masterpiece: I am quite sure any ... __label__2 \n", "6 Buyer beware: This is a self-published book, ... __label__1 \n", "7 Glorious story: I loved Whisper of the wicked... 
__label__2 \n", "8 A FIVE STAR BOOK: I just finished reading Whi... __label__2 \n", "9 Whispers of the Wicked Saints: This was a eas... __label__2 \n", "(1000, 2)\n" ] } ], "source": [ "print(type(Corpus))\n", "print(Corpus.shape)\n", "print(Corpus[:10])\n", "\n", "Corpus = Corpus[:1000]\n", "print(Corpus.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Data Pre-processing\n", "Cleaning and normalizing the text first helps the classification algorithms achieve better results." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1000, 2)\n", "(1000, 2)\n" ] } ], "source": [ "# Step - 1a : Remove blank rows, if any.\n", "print(Corpus.shape)\n", "Corpus.dropna(subset=['text'], inplace=True)\n", "print(Corpus.shape)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 1b : Change all the text to lower case. This is a standard normalization step.\n", "Corpus['text'] = [entry.lower() for entry in Corpus['text']]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 1c : Tokenization : each sample (text chunk) from the corpus is broken into a list of words\n", "Corpus['text'] = [word_tokenize(entry) for entry in Corpus['text']]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 1d : Remove stop words and non-alphabetic tokens, and perform word lemmatization.\n", "\n", "# WordNetLemmatizer requires POS tags to know whether a word is a noun, verb, adjective, etc.\n", "# By default a word is treated as a noun\n", "tag_map = defaultdict(lambda : wn.NOUN)\n", "tag_map['J'] = wn.ADJ\n", "tag_map['V'] = wn.VERB\n", "tag_map['R'] = wn.ADV" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['stuning', 'even', 'for', 'the', 'non-gamer', ':', 'this', 'sound', 'track', 'was', 'beautiful', '!', 'it', 'paints', 'the', 'senery', 'in', 'your', 'mind', 'so', 'well', 'i', 'would', 'recomend', 'it', 'even', 'to', 'people', 'who', 'hate', 'video', 'game', 'music', '!', 'i', 'have', 'played', 'the', 'game', 'chrono', 'cross', 'but', 'out', 'of', 'all', 'of', 'the', 'games', 'i', 'have', 'ever', 'played', 'it', 'has', 'the', 'best', 'music', '!', 'it', 'backs', 'away', 'from', 'crude', 'keyboarding', 'and', 'takes', 'a', 'fresher', 'step', 'with', 'grate', 'guitars', 'and', 'soulful', 'orchestras', '.', 'it', 'would', 'impress', 'anyone', 'who', 'cares', 'to', 'listen', '!', '^_^']\n" ] } ], "source": [ "first_sample = Corpus['text'][0]\n", "print(first_sample)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('stuning', 'VBG'), ('even', 'RB'), ('for', 'IN'), ('the', 'DT'), ('non-gamer', 'JJ'), (':', ':'), ('this', 'DT'), ('sound', 'NN'), ('track', 'NN'), ('was', 'VBD'), ('beautiful', 'JJ'), ('!', '.'), ('it', 'PRP'), ('paints', 'VBZ'), ('the', 'DT'), ('senery', 'NN'), ('in', 'IN'), ('your', 'PRP$'), ('mind', 'NN'), ('so', 'RB'), ('well', 'RB'), ('i', 'VB'), ('would', 'MD'), ('recomend', 'VB'), ('it', 'PRP'), ('even', 'RB'), ('to', 'TO'), ('people', 'NNS'), ('who', 'WP'), ('hate', 'VBP'), ('video', 'NNS'), ('game', 'NN'), ('music', 'NN'), ('!', '.'), ('i', 'NN'), ('have', 'VBP'), ('played', 
'VBN'), ('the', 'DT'), ('game', 'NN'), ('chrono', 'NN'), ('cross', 'NN'), ('but', 'CC'), ('out', 'IN'), ('of', 'IN'), ('all', 'DT'), ('of', 'IN'), ('the', 'DT'), ('games', 'NNS'), ('i', 'VBP'), ('have', 'VBP'), ('ever', 'RB'), ('played', 'VBN'), ('it', 'PRP'), ('has', 'VBZ'), ('the', 'DT'), ('best', 'JJS'), ('music', 'NN'), ('!', '.'), ('it', 'PRP'), ('backs', 'VBZ'), ('away', 'RB'), ('from', 'IN'), ('crude', 'NN'), ('keyboarding', 'NN'), ('and', 'CC'), ('takes', 'VBZ'), ('a', 'DT'), ('fresher', 'JJ'), ('step', 'NN'), ('with', 'IN'), ('grate', 'JJ'), ('guitars', 'NNS'), ('and', 'CC'), ('soulful', 'JJ'), ('orchestras', 'NNS'), ('.', '.'), ('it', 'PRP'), ('would', 'MD'), ('impress', 'VB'), ('anyone', 'NN'), ('who', 'WP'), ('cares', 'VBZ'), ('to', 'TO'), ('listen', 'VB'), ('!', '.'), ('^_^', 'NN')]\n", "text_final ['stun', 'even', 'sound', 'track', 'beautiful', 'paint', 'senery', 'mind', 'well', 'would', 'recomend', 'even', 'people', 'hate', 'video', 'game', 'music', 'play', 'game', 'chrono', 'cross', 'game', 'ever', 'play', 'best', 'music', 'back', 'away', 'crude', 'keyboarding', 'take', 'fresh', 'step', 'grate', 'guitar', 'soulful', 'orchestra', 'would', 'impress', 'anyone', 'care', 'listen']\n" ] } ], "source": [ "Final_words = []\n", "# Initialize the WordNetLemmatizer\n", "word_Lemmatized = WordNetLemmatizer()\n", "# pos_tag provides the 'tag', i.e. whether the word is a Noun (N), Verb (V) or something else.\n", "pos_tag_result = pos_tag(first_sample)\n", "print(pos_tag_result)\n", "for word, tag in pos_tag_result:\n", "    # Keep only alphabetic tokens that are not stop words\n", "    if word not in stopwords.words('english') and word.isalpha():\n", "        word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])\n", "        Final_words.append(word_Final)\n", "# The final processed list of words is stored in 'Final_words'\n", "print('text_final',str(Final_words))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Check whether the corpus has already been processed and cached, since the processing loop below takes several minutes. Caching intermediate results like this is good practice."
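, "\n", "\n", "# A minimal, illustrative sketch (an assumption, not part of the original pipeline):\n", "# the simple file-exists check in the next cell never invalidates the cache, so one\n", "# option is to key the pickle file name on a hash of the raw CSV. The helper name\n", "# 'corpus_cache_path' is hypothetical.\n", "import hashlib\n", "\n", "def corpus_cache_path(csv_path=\"corpus.csv\"):\n", "    with open(csv_path, \"rb\") as fh:\n", "        digest = hashlib.md5(fh.read()).hexdigest()\n", "    return \"processed_corpus_{}.pickle\".format(digest)"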
] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pickle\n", "import os\n", "import os.path\n", "if os.path.isfile(\"processed_corpus.pickle\"):\n", "    with open('processed_corpus.pickle', 'rb') as f:\n", "        Corpus = pickle.load(f)\n", "else:\n", "    for index,entry in enumerate(tqdm.tqdm(Corpus['text'])):\n", "        # Empty list to store the words that pass the filters for this entry\n", "        Final_words = []\n", "        # Initialize the WordNetLemmatizer\n", "        word_Lemmatized = WordNetLemmatizer()\n", "        # pos_tag provides the 'tag', i.e. whether the word is a Noun (N), Verb (V) or something else.\n", "        for word, tag in pos_tag(entry):\n", "            # Keep only alphabetic tokens that are not stop words\n", "            if word not in stopwords.words('english') and word.isalpha():\n", "                word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])\n", "                Final_words.append(word_Final)\n", "        # The final processed list of words for each entry is stored in 'text_final'\n", "        Corpus.loc[index,'text_final'] = str(Final_words)\n", "\n", "    with open(\"processed_corpus.pickle\", \"wb\") as f:\n", "        pickle.dump(Corpus, f)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 ['stun', 'even', 'sound', 'track', 'beautiful'...\n", "1 ['best', 'soundtrack', 'ever', 'anything', 're...\n", "2 ['amaze', 'soundtrack', 'favorite', 'music', '...\n", "3 ['excellent', 'soundtrack', 'truly', 'like', '...\n", "4 ['remember', 'pull', 'jaw', 'floor', 'hear', '...\n", "Name: text_final, dtype: object\n" ] } ], "source": [ "print(Corpus['text_final'].head())" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " text label \\\n", "0 [stuning, even, for, the, non-gamer, :, this, ... __label__2 \n", "1 [the, best, soundtrack, ever, to, anything, .,... __label__2 \n", "2 [amazing, !, :, this, soundtrack, is, my, favo... __label__2 \n", "3 [excellent, soundtrack, :, i, truly, like, thi... __label__2 \n", "4 [remember, ,, pull, your, jaw, off, the, floor... __label__2 \n", "\n", " text_final \n", "0 ['stun', 'even', 'sound', 'track', 'beautiful'... \n", "1 ['best', 'soundtrack', 'ever', 'anything', 're... \n", "2 ['amaze', 'soundtrack', 'favorite', 'music', '... \n", "3 ['excellent', 'soundtrack', 'truly', 'like', '... \n", "4 ['remember', 'pull', 'jaw', 'floor', 'hear', '... 
\n" ] } ], "source": [ "print(Corpus[:5])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 2: Split the corpus into train and test data sets\n", "Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 3: Label encode the target variable\n", "# This transforms the string-type categorical labels in the data set into numerical values\n", "Encoder = LabelEncoder()\n", "Train_Y = Encoder.fit_transform(Train_Y)\n", "# Reuse the encoding fitted on the training labels for the test labels\n", "Test_Y = Encoder.transform(Test_Y)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Step - 4: Vectorize the words by using TF-IDF Vectorizer\n", "# This measures how important a word is to a document in comparison to the whole corpus\n", "Tfidf_vect = TfidfVectorizer(max_features=5000)\n", "Tfidf_vect.fit(Corpus['text_final'])\n", "\n", "Train_X_Tfidf = Tfidf_vect.transform(Train_X)\n", "Test_X_Tfidf = Tfidf_vect.transform(Test_X)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Naive Bayes Accuracy Score -> 81.33333333333333\n" ] } ], "source": [ "# Step - 5: Run different algorithms to classify our data and check their accuracy\n", "\n", "# Classifier - Algorithm - Naive Bayes\n", "# Fit the classifier on the training dataset\n", "Naive = naive_bayes.MultinomialNB()\n", "Naive.fit(Train_X_Tfidf,Train_Y)\n", "\n", "# Predict the labels on the validation dataset\n", "predictions_NB = Naive.predict(Test_X_Tfidf)\n", "\n", "# Use the accuracy_score function to get the accuracy\n", "print(\"Naive Bayes Accuracy Score -> \",accuracy_score(predictions_NB, Test_Y)*100)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SVM Accuracy Score -> 80.0\n", "[(1, 0.13771564364451283), (1, 0.6901297920139712), (0, -0.1981896618752993), (0, -0.4480398990108602), (0, -0.778919669893617)]\n" ] } ], "source": [ "# Classifier - Algorithm - SVM\n", "# Fit the classifier on the training dataset\n", "SVM = svm.LinearSVC()\n", "SVM.fit(Train_X_Tfidf,Train_Y)\n", "\n", "# Predict the labels on the validation dataset\n", "predictions_SVM = SVM.predict(Test_X_Tfidf)\n", "distances = SVM.decision_function(Test_X_Tfidf)\n", "\n", "# Use the accuracy_score function to get the accuracy\n", "print(\"SVM Accuracy Score -> \",accuracy_score(predictions_SVM, Test_Y)*100)\n", "print(list(zip(predictions_SVM,distances))[:5])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Code largely based on https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',\n", " lowercase=True, max_df=1.0, max_features=5000, min_df=1,\n", " ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,\n", " stop_words=None, strip_accents=None, sublinear_tf=False,\n", " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", " vocabulary=None)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Tfidf_vect" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5000\n", "['aa', 'aaa', 'ab', 'abandon', 'abbreviate', 'abc', 'abdomen', 'abdominal', 'ability', 'abit']\n" ] } ], "source": [ "print(len(Tfidf_vect.get_feature_names()))\n", "print(Tfidf_vect.get_feature_names()[:10])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "vectorizer = TfidfVectorizer(max_features=5000)\n", "X = vectorizer.fit_transform(Corpus['text_final'])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "<1000x5000 sparse matrix of type '<class 'numpy.float64'>'\n", "\twith 31808 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n" ] } ], "source": [ "print(Tfidf_vect.get_stop_words())" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1, 5000)\n", "<class 'numpy.ndarray'>\n" ] } ], "source": [ "print(SVM.coef_.shape)\n", "print(type(SVM.coef_))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[-0.09050077 0. 0.10122429 ... 0.0352994 -0.07236617\n", " 0. ]]\n" ] } ], "source": [ "print(SVM.coef_)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(5000,)\n", "<class 'numpy.ndarray'>\n" ] } ], "source": [ "svm_bias = SVM.intercept_[0]\n", "svm_coef = SVM.coef_.reshape(SVM.coef_.shape[1])\n", "print(svm_coef.shape)\n", "print(type(svm_coef))" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " (0, 4147)\t0.5199645135146538\n", " (0, 3804)\t0.3850804174786943\n", " (0, 2758)\t0.2432894135238121\n", " (0, 2654)\t0.238784128756363\n", " (0, 2466)\t0.16675247844565785\n", " (0, 1770)\t0.17679709214511746\n", " (0, 424)\t0.35403773308506453\n", " (0, 410)\t0.3908483818512366\n", " (0, 184)\t0.3577339835390647\n" ] } ], "source": [ "test_str = \"This is awesome. Very good movie and amazing soundtrack. 
Not like those old average movies from Tarantino.\"\n", "test_str_Tfidf = Tfidf_vect.transform([test_str])\n", "print(test_str_Tfidf)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "<1x5000 sparse matrix of type '<class 'numpy.float64'>'\n", "\twith 9 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_str_Tfidf" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'_shape': (1, 5000),\n", " 'data': array([0.51996451, 0.38508042, 0.24328941, 0.23878413, 0.16675248,\n", " 0.17679709, 0.35403773, 0.39084838, 0.35773398]),\n", " 'indices': array([4147, 3804, 2758, 2654, 2466, 1770, 424, 410, 184], dtype=int32),\n", " 'indptr': array([0, 9], dtype=int32),\n", " 'maxprint': 50}" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_str_Tfidf.__dict__" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### test_str tokens weighted with TF-IDF" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{4147: 0.5199645135146538, 3804: 0.3850804174786943, 2758: 0.2432894135238121, 2654: 0.238784128756363, 2466: 0.16675247844565785, 1770: 0.17679709214511746, 424: 0.35403773308506453, 410: 0.3908483818512366, 184: 0.3577339835390647}\n" ] } ], "source": [ "print({k:v for k,v in zip(test_str_Tfidf.indices, test_str_Tfidf.data)})" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'tarantino': 0.5199645135146538, 'soundtrack': 0.3850804174786943, 'old': 0.2432894135238121, 'movie': 0.238784128756363, 'like': 0.16675247844565785, 'good': 0.17679709214511746, 'awesome': 0.35403773308506453, 'average': 0.3908483818512366, 'amazing': 0.3577339835390647}\n" ] } ], "source": [ "print({Tfidf_vect.get_feature_names()[k]:v for k,v in zip(test_str_Tfidf.indices, test_str_Tfidf.data)})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### All SVM weights" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "all_svm_weights = {Tfidf_vect.get_feature_names()[k]:svm_coef[k] for k in range(0, svm_coef.shape[0])}\n", "all_nonzero_svm_weights = {Tfidf_vect.get_feature_names()[k]:svm_coef[k] for k in range(0, svm_coef.shape[0]) if svm_coef[k]!=0.0}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### SVM weights for the test_str tokens" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'tarantino': -0.041343894497876434, 'soundtrack': 0.21409584517162913, 'old': -0.44373803191538447, 'movie': -0.41199304223305466, 'like': -0.20699017901400116, 'good': 1.1069508669130126, 'awesome': 0.8209169581723464, 'average': 0.15341076571124157, 'amazing': 1.006800041750699}\n" ] } ], "source": [ "print({Tfidf_vect.get_feature_names()[k]:svm_coef[k] for k in test_str_Tfidf.indices})" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6308574307108017\n" ] } ], "source": [ "test_str_raw_score_val = svm_bias + sum([svm_coef[k]*v for k,v in 
zip(test_str_Tfidf.indices, test_str_Tfidf.data)])\n", "print(test_str_raw_score_val)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "__label__2 \n", "0.6308574307108017\n" ] } ], "source": [ "test_str_prediction_SVM = SVM.predict(test_str_Tfidf)\n", "test_str_distance = SVM.decision_function(test_str_Tfidf)\n", "print(Encoder.classes_[test_str_prediction_SVM[0]])\n", "print(test_str_distance[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Normalizing SVM predictions to [0,1]\n", "https://www.csie.ntu.edu.tw/~cjlin/papers/plattprob.pdf" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def sigmoid(x, k=1):\n", "    return 1/(1 + math.exp(-k*x))" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6526838565743812\n" ] } ], "source": [ "print(sigmoid(test_str_distance[0]))" ] }
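, { "cell_type": "markdown", "metadata": {}, "source": [ "The fixed sigmoid above squashes the decision value into (0, 1), but its slope is arbitrary. Platt scaling (the paper linked above) instead fits the sigmoid parameters on held-out data. The cell below is a minimal, illustrative sketch of doing this with scikit-learn's CalibratedClassifierCV; the variable names are assumptions and the cell was not executed as part of this run." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Illustrative sketch (assumption, not the original method): Platt-scale the linear SVM via cross-validation\n", "from sklearn.calibration import CalibratedClassifierCV\n", "\n", "calibrated_svm = CalibratedClassifierCV(svm.LinearSVC(), method='sigmoid', cv=5)\n", "calibrated_svm.fit(Train_X_Tfidf, Train_Y)\n", "\n", "# Probability of the positive class (__label__2) for the test sentence\n", "test_str_proba = calibrated_svm.predict_proba(test_str_Tfidf)[0, 1]\n", "print(test_str_proba)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }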